egra: added assembler-optimised span blending function for agg
[iv.d.git] / kissfft.d
blobf6f8df31e65d06dcf9af0b54b01c47104e2c97a6
1 /*
2 * Copyright (c) 2003-2010, Mark Borgerding
4 * All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are
8 * met:
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the
16 * distribution.
18 * * Neither the author nor the names of any contributors may be used to
19 * endorse or promote products derived from this software without
20 * specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 /* Ported by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
35 * Understanding is not required. Only obedience.
37 * This program is free software: you can redistribute it and/or modify
38 * it under the terms of the GNU General Public License as published by
39 * the Free Software Foundation, version 3 of the License ONLY.
41 * This program is distributed in the hope that it will be useful,
42 * but WITHOUT ANY WARRANTY; without even the implied warranty of
43 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
44 * GNU General Public License for more details.
46 * You should have received a copy of the GNU General Public License
47 * along with this program. If not, see <http://www.gnu.org/licenses/>.
49 module iv.kissfft;
50 private nothrow @trusted @nogc:
52 version(aliced) {} else alias usize = size_t;
54 //version = kiss_fft_use_parallel; // absolutely no reason to
57 // ////////////////////////////////////////////////////////////////////////// //
58 ///
59 public enum KissFFT { Forward, Inverse }
61 ///
62 //public alias kiss_fft_scalar = float;
63 //public alias kiss_fft_scalar = double;
65 ///
66 public template kiss_fft_scalar(T) if (is(T == float) || is(T == double) || is(T == real)) {
67 alias kiss_fft_scalar = T;
71 ///
72 public template isGoodKissFFTScalar(T) {
73 static if (is(T == float) || is(T == double) || is(T == real)) {
74 enum isGoodKissFFTScalar = true;
75 } else {
76 enum isGoodKissFFTScalar = false;
81 ///
82 public template isKissFFTComplex(T) {
83 static if (is(T : kiss_fft_cpx!S, S)) {
84 enum isKissFFTComplex = true;
85 } else {
86 enum isKissFFTComplex = false;
91 ///
92 public template KissFFTScalar(T) {
93 static if (is(T : kiss_fft_cpx!S, S)) {
94 alias KissFFTScalar = S;
95 } else {
96 static assert(0, "not a KissFFT complex type");
101 public align(1) struct kiss_fft_cpx(T) if (is(T == float) || is(T == double) || is(T == real)) {
102 align(1):
103 alias Scalar = T;
104 T r;
105 T i;
107 T opIndex (uint n) const pure nothrow @trusted @nogc { pragma(inline, true); return (n ? i : r); }
108 void opIndexAssign (in T v, uint n) nothrow @trusted @nogc { pragma(inline, true); if (n) i = v; else r = v; }
112 public alias kiss_fft_cpx_f = kiss_fft_cpx!float; ///
113 public alias kiss_fft_cpx_d = kiss_fft_cpx!double; ///
116 public alias kiss_fft_cfg_f = kiss_fft_state!float*; ///
117 public alias kiss_fft_cfg_d = kiss_fft_state!double*; ///
119 public template kiss_fft_cfg(T) if (is(T == float) || is(T == double) || is(T == real)) {
120 alias kiss_fft_cfg = kiss_fft_state!T*;
124 // ////////////////////////////////////////////////////////////////////////// //
125 /** Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
127 * typical usage: `kiss_fft_cfg mycfg = kiss_fft_alloc(1024, KissFFT.Forward);`
129 * The return value from fft_alloc is a cfg buffer used internally by the fft routine or `null`.
131 * If `lenmem` is `null`, then kiss_fft_alloc will allocate a cfg buffer using `malloc`.
132 * The returned value should be `kiss_fft_free()`d when done to avoid memory leaks.
134 * The state can be placed in a user supplied buffer `mem`:
135 * If `lenmem` is not `null` and `mem` is not `null` and `*lenmem` is large enough,
136 * then the function places the cfg in `mem` and the size used in `*lenmem`,
137 * and returns `mem`.
139 * If `lenmem` is not `null` and (`mem` is `null` or `*lenmem` is not large enough),
140 * then the function returns `null` and places the minimum cfg buffer size in `*lenmem`.
142 public kiss_fft_cfg!S kiss_fft_alloc(S) (uint nfft, KissFFT dir, void* mem=null, usize* lenmem=null)
143 if (is(S == float) || is(S == double) || is(S == real))
145 kiss_fft_cfg!S st = null;
146 usize memneeded = (kiss_fft_state!S).sizeof+(kiss_fft_cpx!S).sizeof*(nfft-1); // twiddle factors
148 if (lenmem is null) {
149 import core.stdc.stdlib : malloc;
150 st = cast(kiss_fft_cfg!S)malloc(memneeded);
151 } else {
152 if (mem !is null && *lenmem >= memneeded) st = cast(kiss_fft_cfg!S)mem;
153 *lenmem = memneeded;
156 if (st !is null) {
157 st.nfft = nfft;
158 st.inverse = (dir == KissFFT.Inverse);
159 foreach (immutable uint i; 0..nfft) {
160 enum pi = 3.141592653589793238462643383279502884197169399375105820974944;
161 immutable double phase = -2*pi*i/nfft*(st.inverse ? -1 : 1);
162 //if (st.inverse) phase = -phase;
163 mixin(kf_cexp!("st.twiddles.ptr+i", "phase"));
165 kf_factor!S(nfft, st.factors.ptr);
168 return st;
172 /** If kiss_fft_alloc allocated a buffer, it is one contiguous
173 * buffer and can be simply free()d when no longer needed
175 public void kiss_fft_free(S) (ref kiss_fft_cfg!S p)
176 if (is(S == float) || is(S == double) || is(S == real))
178 if (p !is null) {
179 import core.stdc.stdlib : free;
180 free(p);
181 p = null;
186 /** Perform an FFT on a complex input buffer.
188 * for a forward FFT,
189 * fin should be f[0] , f[1] , ... ,f[nfft-1]
190 * fout will be F[0] , F[1] , ... ,F[nfft-1]
191 * Note that each element is complex and can be accessed like f[k].r and f[k].i
193 public void kiss_fft(S) (kiss_fft_cfg!S cfg, const(kiss_fft_cpx!S)* fin, kiss_fft_cpx!S* fout)
194 if (is(S == float) || is(S == double) || is(S == real))
196 assert(cfg !is null);
197 kiss_fft_stride!S(cfg, fin, fout, 1);
201 /** A more generic version of the above function. It reads its input from every Nth sample. */
202 public void kiss_fft_stride(S) (kiss_fft_cfg!S st, const(kiss_fft_cpx!S)* fin, kiss_fft_cpx!S* fout, uint in_stride)
203 if (is(S == float) || is(S == double) || is(S == real))
205 import core.stdc.stdlib : alloca;
206 assert(st !is null);
207 if (fin is fout) {
208 import core.stdc.string : memcpy;
209 //NOTE: this is not really an in-place FFT algorithm.
210 //It just performs an out-of-place FFT into a temp buffer
211 //kiss_fft_cpx* tmpbuf = cast(kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(kiss_fft_cpx.sizeof*st.nfft);
212 kiss_fft_cpx!S* tmpbuf = cast(kiss_fft_cpx!S*)alloca((kiss_fft_cpx!S).sizeof*st.nfft);
213 kf_work!S(tmpbuf, fin, 1, in_stride, st.factors.ptr, st);
214 memcpy(fout, tmpbuf, (kiss_fft_cpx!S).sizeof*st.nfft);
215 //KISS_FFT_TMP_FREE(tmpbuf);
216 } else {
217 kf_work!S(fout, fin, 1, in_stride, st.factors.ptr, st);
222 /** Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5) */
223 public uint kiss_fft_next_fast_size (uint n) {
224 for (;;) {
225 uint m = n;
226 while ((m%2) == 0) m /= 2;
227 while ((m%3) == 0) m /= 3;
228 while ((m%5) == 0) m /= 5;
229 if (m <= 1) break; // n is completely factorable by twos, threes, and fives
230 ++n;
232 return n;
236 // ////////////////////////////////////////////////////////////////////////// //
237 // kissfftr
238 // Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
240 public alias kiss_fftr_cfg_f = kiss_fftr_state!float*; ///
241 public alias kiss_fftr_cfg_d = kiss_fftr_state!double*; ///
244 public template kiss_fftr_cfg(T) if (is(T == float) || is(T == double) || is(T == real)) {
245 alias kiss_fftr_cfg = kiss_fftr_state!T*;
248 struct kiss_fftr_state(S) if (is(S == float) || is(S == double) || is(S == real)) {
249 kiss_fft_cfg!S substate;
250 kiss_fft_cpx!S* tmpbuf;
251 kiss_fft_cpx!S* super_twiddles;
256 ** nfft must be even
258 * If you don't care to allocate space, use mem = lenmem = null
260 public kiss_fftr_cfg!S kiss_fftr_alloc(S) (uint nfft, KissFFT dir, void* mem, usize* lenmem)
261 if (is(S == float) || is(S == double) || is(S == real))
263 kiss_fftr_cfg!S st = null;
264 usize subsize, memneeded;
266 if (nfft&1) return null; //assert(0, "real FFT optimization must be even");
267 nfft >>= 1;
269 kiss_fft_alloc!S(nfft, dir, null, &subsize);
270 memneeded = (kiss_fftr_state!S).sizeof+subsize+(kiss_fft_cpx!S).sizeof*(nfft*3/2);
272 if (lenmem is null) {
273 import core.stdc.stdlib : malloc;
274 st = cast(kiss_fftr_cfg!S)malloc(memneeded);
275 } else {
276 if (*lenmem >= memneeded) st = cast(kiss_fftr_cfg!S)mem;
277 *lenmem = memneeded;
279 if (st is null) return null;
281 st.substate = cast(kiss_fft_cfg!S)(st+1); // just beyond kiss_fftr_state struct
282 st.tmpbuf = cast(kiss_fft_cpx!S*)((cast(ubyte*)st.substate)+subsize);
283 st.super_twiddles = st.tmpbuf+nfft;
284 kiss_fft_alloc!S(nfft, dir, st.substate, &subsize);
286 foreach (immutable i; 0..nfft/2) {
287 enum pi = 3.141592653589793238462643383279502884197169399375105820974944;
288 immutable double phase = -pi*(cast(double)(i+1)/nfft+0.5)*(dir == KissFFT.Inverse ? -1 : 1);
289 //if (dir == KissFFT.Inverse) phase = -phase;
290 mixin(kf_cexp!("st.super_twiddles+i", "phase"));
293 return st;
297 /** Use this to free `fftr` buffer. */
298 public void kiss_fft_free(S) (ref kiss_fftr_cfg!S p)
299 if (is(S == float) || is(S == double) || is(S == real))
301 if (p !is null) {
302 import core.stdc.stdlib : free;
303 free(p);
304 p = null;
309 /** input timedata has nfft scalar points
310 * output freqdata has nfft/2+1 complex points
312 public void kiss_fftr(S) (kiss_fftr_cfg!S st, const(kiss_fft_scalar!S)* timedata, kiss_fft_cpx!S* freqdata)
313 if (is(S == float) || is(S == double) || is(S == real))
315 // input buffer timedata is stored row-wise
316 uint k, ncfft;
317 kiss_fft_cpx!S fpnk, fpk, f1k, f2k, tw, tdc;
319 if (st.substate.inverse) assert(0, "kiss fft usage error: improper alloc");
321 ncfft = st.substate.nfft;
323 // perform the parallel fft of two real signals packed in real,imag
324 kiss_fft!S(st.substate, cast(const(kiss_fft_cpx!S)*)timedata, st.tmpbuf);
325 /* The real part of the DC element of the frequency spectrum in st.tmpbuf
326 * contains the sum of the even-numbered elements of the input time sequence
327 * The imag part is the sum of the odd-numbered elements
329 * The sum of tdc.r and tdc.i is the sum of the input time sequence.
330 * yielding DC of input time sequence
331 * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1...
332 * yielding Nyquist bin of input time sequence
335 tdc.r = st.tmpbuf[0].r;
336 tdc.i = st.tmpbuf[0].i;
337 freqdata[0].r = tdc.r+tdc.i;
338 freqdata[ncfft].r = tdc.r-tdc.i;
339 freqdata[ncfft].i = freqdata[0].i = 0;
341 for (k = 1; k <= ncfft/2; ++k) {
342 fpk = st.tmpbuf[k];
343 fpnk.r = st.tmpbuf[ncfft-k].r;
344 fpnk.i = -st.tmpbuf[ncfft-k].i;
346 mixin(C_ADD!("f1k", "fpk", "fpnk"));
347 mixin(C_SUB!("f2k", "fpk", "fpnk"));
348 mixin(C_MUL!("tw", "f2k", "st.super_twiddles[k-1]"));
350 freqdata[k].r = mixin(HALF_OF!"f1k.r+tw.r");
351 freqdata[k].i = mixin(HALF_OF!"f1k.i+tw.i");
352 freqdata[ncfft-k].r = mixin(HALF_OF!"f1k.r-tw.r");
353 freqdata[ncfft-k].i = mixin(HALF_OF!"tw.i-f1k.i");
358 /** input freqdata has nfft/2+1 complex points
359 * output timedata has nfft scalar points
361 public void kiss_fftri(S) (kiss_fftr_cfg!S st, const(kiss_fft_cpx!S)* freqdata, kiss_fft_scalar!S* timedata)
362 if (is(S == float) || is(S == double) || is(S == real))
364 // input buffer timedata is stored row-wise
365 if (st.substate.inverse == 0) assert(0, "kiss fft usage error: improper alloc");
367 uint ncfft = st.substate.nfft;
369 st.tmpbuf[0].r = freqdata[0].r+freqdata[ncfft].r;
370 st.tmpbuf[0].i = freqdata[0].r-freqdata[ncfft].r;
372 foreach (immutable uint k; 1..ncfft/2+1) {
373 kiss_fft_cpx!S fnkc = void, fek = void, fok = void, tmp = void;
374 kiss_fft_cpx!S fk = freqdata[k];
375 fnkc.r = freqdata[ncfft-k].r;
376 fnkc.i = -freqdata[ncfft-k].i;
377 mixin(C_ADD!("fek", "fk", "fnkc"));
378 mixin(C_SUB!("tmp", "fk", "fnkc"));
379 mixin(C_MUL!("fok", "tmp", "st.super_twiddles[k-1]"));
380 mixin(C_ADD!("st.tmpbuf[k]", "fek", "fok"));
381 mixin(C_SUB!("st.tmpbuf[ncfft-k]", "fek", "fok"));
382 st.tmpbuf[ncfft-k].i *= -1;
384 kiss_fft!S(st.substate, st.tmpbuf, cast(kiss_fft_cpx!S*)timedata);
388 /** for real ffts, we need an even size */
389 public uint kiss_fftr_next_fast_size_real (uint n) {
390 pragma(inline, true);
391 return kiss_fft_next_fast_size((n+1)>>1)<<1;
395 // ////////////////////////////////////////////////////////////////////////// //
396 enum MAXFACTORS = 32;
397 /* e.g. an fft of length 128 has 4 factors
398 as far as kissfft is concerned
399 4*4*4*2
402 struct kiss_fft_state(S) if (is(S == float) || is(S == double) || is(S == real)) {
403 uint nfft;
404 uint inverse;
405 uint[2*MAXFACTORS] factors;
406 kiss_fft_cpx!S[1] twiddles;
410 // ////////////////////////////////////////////////////////////////////////// //
411 private void kf_bfly2(S) (kiss_fft_cpx!S* Fout, in usize fstride, const(kiss_fft_cfg!S) st, int m) {
412 kiss_fft_cpx!S* Fout2;
413 const(kiss_fft_cpx!S)* tw1 = st.twiddles.ptr;
414 kiss_fft_cpx!S t;
415 Fout2 = Fout+m;
416 do {
417 mixin(C_MUL!("t", "*Fout2", "*tw1"));
418 tw1 += fstride;
419 mixin(C_SUB!("*Fout2", "*Fout", "t"));
420 mixin(C_ADDTO!("*Fout", "t"));
421 ++Fout2;
422 ++Fout;
423 } while (--m);
427 private void kf_bfly4(S) (kiss_fft_cpx!S* Fout, in usize fstride, const(kiss_fft_cfg!S) st, in usize m) {
428 const(kiss_fft_cpx!S)* tw1, tw2, tw3;
429 kiss_fft_cpx!S[6] scratch = void;
430 usize k = m;
431 immutable usize m2 = 2*m;
432 immutable usize m3 = 3*m;
433 tw3 = tw2 = tw1 = st.twiddles.ptr;
434 do {
435 mixin(C_MUL!("scratch[0]", "Fout[m]", "*tw1"));
436 mixin(C_MUL!("scratch[1]", "Fout[m2]", "*tw2"));
437 mixin(C_MUL!("scratch[2]", "Fout[m3]", "*tw3"));
439 mixin(C_SUB!("scratch[5]", "*Fout", "scratch[1]"));
440 mixin(C_ADDTO!("*Fout", "scratch[1]"));
441 mixin(C_ADD!("scratch[3]", "scratch[0]", "scratch[2]"));
442 mixin(C_SUB!("scratch[4]", "scratch[0]", "scratch[2]"));
443 mixin(C_SUB!("Fout[m2]", "*Fout", "scratch[3]"));
444 tw1 += fstride;
445 tw2 += fstride*2;
446 tw3 += fstride*3;
447 mixin(C_ADDTO!("*Fout", "scratch[3]"));
449 if (st.inverse) {
450 Fout[m].r = scratch[5].r-scratch[4].i;
451 Fout[m].i = scratch[5].i+scratch[4].r;
452 Fout[m3].r = scratch[5].r+scratch[4].i;
453 Fout[m3].i = scratch[5].i-scratch[4].r;
454 } else {
455 Fout[m].r = scratch[5].r+scratch[4].i;
456 Fout[m].i = scratch[5].i-scratch[4].r;
457 Fout[m3].r = scratch[5].r-scratch[4].i;
458 Fout[m3].i = scratch[5].i+scratch[4].r;
460 ++Fout;
461 } while (--k);
465 private void kf_bfly3(S) (kiss_fft_cpx!S* Fout, in usize fstride, const(kiss_fft_cfg!S) st, usize m) {
466 usize k = m;
467 immutable usize m2 = 2*m;
468 const(kiss_fft_cpx!S)* tw1, tw2;
469 kiss_fft_cpx!S[5] scratch = void;
470 kiss_fft_cpx!S epi3;
471 epi3 = st.twiddles[fstride*m];
472 tw1 = tw2 = st.twiddles.ptr;
473 do {
474 mixin(C_MUL!("scratch[1]", "Fout[m]", "*tw1"));
475 mixin(C_MUL!("scratch[2]", "Fout[m2]", "*tw2"));
477 mixin(C_ADD!("scratch[3]", "scratch[1]", "scratch[2]"));
478 mixin(C_SUB!("scratch[0]", "scratch[1]", "scratch[2]"));
479 tw1 += fstride;
480 tw2 += fstride*2;
482 Fout[m].r = Fout.r-mixin(HALF_OF!"scratch[3].r");
483 Fout[m].i = Fout.i-mixin(HALF_OF!"scratch[3].i");
485 mixin(C_MULBYSCALAR!("scratch[0]", "epi3.i"));
487 mixin(C_ADDTO!("*Fout", "scratch[3]"));
489 Fout[m2].r = Fout[m].r+scratch[0].i;
490 Fout[m2].i = Fout[m].i-scratch[0].r;
492 Fout[m].r -= scratch[0].i;
493 Fout[m].i += scratch[0].r;
495 ++Fout;
496 } while (--k);
500 private void kf_bfly5(S) (kiss_fft_cpx!S* Fout, in usize fstride, const(kiss_fft_cfg!S) st, uint m) {
501 kiss_fft_cpx!S* Fout0, Fout1, Fout2, Fout3, Fout4;
502 kiss_fft_cpx!S[13] scratch = void;
503 const(kiss_fft_cpx!S)* twiddles = st.twiddles.ptr;
504 const(kiss_fft_cpx!S)* tw;
505 kiss_fft_cpx!S ya = twiddles[fstride*m];
506 kiss_fft_cpx!S yb = twiddles[fstride*2*m];
508 Fout0 = Fout;
509 Fout1 = Fout0+m;
510 Fout2 = Fout0+2*m;
511 Fout3 = Fout0+3*m;
512 Fout4 = Fout0+4*m;
514 tw = st.twiddles.ptr;
515 foreach (immutable uint u; 0..m) {
516 scratch[0] = *Fout0;
518 mixin(C_MUL!("scratch[1]", "*Fout1", "tw[u*fstride]"));
519 mixin(C_MUL!("scratch[2]", "*Fout2", "tw[2*u*fstride]"));
520 mixin(C_MUL!("scratch[3]", "*Fout3", "tw[3*u*fstride]"));
521 mixin(C_MUL!("scratch[4]", "*Fout4", "tw[4*u*fstride]"));
523 mixin(C_ADD!("scratch[7]", "scratch[1]", "scratch[4]"));
524 mixin(C_SUB!("scratch[10]", "scratch[1]", "scratch[4]"));
525 mixin(C_ADD!("scratch[8]", "scratch[2]", "scratch[3]"));
526 mixin(C_SUB!("scratch[9]", "scratch[2]", "scratch[3]"));
528 Fout0.r += scratch[7].r+scratch[8].r;
529 Fout0.i += scratch[7].i+scratch[8].i;
531 scratch[5].r = scratch[0].r+scratch[7].r*ya.r+scratch[8].r*yb.r;
532 scratch[5].i = scratch[0].i+scratch[7].i*ya.r+scratch[8].i*yb.r;
534 scratch[6].r = scratch[10].i*ya.i+scratch[9].i*yb.i;
535 scratch[6].i = -scratch[10].r*ya.i-scratch[9].r*yb.i;
537 mixin(C_SUB!("*Fout1", "scratch[5]", "scratch[6]"));
538 mixin(C_ADD!("*Fout4", "scratch[5]", "scratch[6]"));
540 scratch[11].r = scratch[0].r+scratch[7].r*yb.r+scratch[8].r*ya.r;
541 scratch[11].i = scratch[0].i+scratch[7].i*yb.r+scratch[8].i*ya.r;
542 scratch[12].r = -scratch[10].i*yb.i+scratch[9].i*ya.i;
543 scratch[12].i = scratch[10].r*yb.i-scratch[9].r*ya.i;
545 mixin(C_ADD!("*Fout2", "scratch[11]", "scratch[12]"));
546 mixin(C_SUB!("*Fout3", "scratch[11]", "scratch[12]"));
548 ++Fout0;
549 ++Fout1;
550 ++Fout2;
551 ++Fout3;
552 ++Fout4;
556 // perform the butterfly for one stage of a mixed radix FFT
557 private void kf_bfly_generic(S) (kiss_fft_cpx!S* Fout, in usize fstride, const(kiss_fft_cfg!S) st, uint m, uint p) {
558 import core.stdc.stdlib : alloca;
560 //uint q1, q;
561 const(kiss_fft_cpx!S)* twiddles = st.twiddles.ptr;
562 kiss_fft_cpx!S t;
563 uint Norig = st.nfft;
565 //kiss_fft_cpx* scratch = cast(kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(kiss_fft_cpx.sizeof*p);
566 kiss_fft_cpx!S* scratch = cast(kiss_fft_cpx!S*)alloca((kiss_fft_cpx!S).sizeof*p);
568 foreach (immutable uint u; 0..m) {
569 uint k = u;
570 foreach (immutable uint q1; 0..p) {
571 scratch[q1] = Fout[k];
572 k += m;
575 k = u;
576 foreach (immutable uint q1; 0..p) {
577 uint twidx = 0;
578 Fout[k] = scratch[0];
579 foreach (immutable uint q; 1..p) {
580 twidx += fstride*k;
581 if (twidx >= Norig) twidx -= Norig;
582 mixin(C_MUL!("t", "scratch[q]", "twiddles[twidx]"));
583 mixin(C_ADDTO!("Fout[k]", "t"));
585 k += m;
588 //KISS_FFT_TMP_FREE(scratch);
592 private void kf_work(S) (kiss_fft_cpx!S* Fout, const(kiss_fft_cpx!S)* f, in usize fstride, uint in_stride, uint* factors, const(kiss_fft_cfg!S) st) {
593 kiss_fft_cpx!S* Fout_beg = Fout;
594 immutable uint p = *factors++; // the radix
595 immutable uint m = *factors++; // stage's fft length/p
596 const(kiss_fft_cpx!S)* Fout_end = Fout+p*m;
598 version(kiss_fft_use_parallel) {
599 // use threads at the top-level (not recursive)
600 import std.parallelism;
601 import std.range : iota;
602 if (fstride == 1 && p <= 5) {
603 // execute the p different work units in different threads
604 foreach (uint k; parallel(iota(p))) {
605 kf_work!S(Fout+k*m, f+fstride*in_stride*k, fstride*p, in_stride, factors, st);
607 // all threads have joined by this point
608 switch (p) {
609 case 2: kf_bfly2!S(Fout, fstride, st, m); break;
610 case 3: kf_bfly3!S(Fout, fstride, st, m); break;
611 case 4: kf_bfly4!S(Fout, fstride, st, m); break;
612 case 5: kf_bfly5!S(Fout, fstride, st, m); break;
613 default: kf_bfly_generic!S(Fout, fstride, st, m, p); break;
615 return;
619 if (m == 1) {
620 do {
621 *Fout = *f;
622 f += fstride*in_stride;
623 } while (++Fout !is Fout_end);
624 } else {
625 do {
626 // recursive call:
627 // DFT of size m*p performed by doing
628 // p instances of smaller DFTs of size m,
629 // each one takes a decimated version of the input
630 kf_work!S(Fout, f, fstride*p, in_stride, factors, st);
631 f += fstride*in_stride;
632 } while ((Fout += m) !is Fout_end);
635 Fout = Fout_beg;
637 // recombine the p smaller DFTs
638 switch (p) {
639 case 2: kf_bfly2!S(Fout, fstride, st, m); break;
640 case 3: kf_bfly3!S(Fout, fstride, st, m); break;
641 case 4: kf_bfly4!S(Fout, fstride, st, m); break;
642 case 5: kf_bfly5!S(Fout, fstride, st, m); break;
643 default: kf_bfly_generic!S(Fout, fstride, st, m, p); break;
648 /* facbuf is populated by p1, m1, p2, m2, ...
649 * where
650 * p[i]*m[i] = m[i-1]
651 * m0 = n
653 private void kf_factor(S) (uint n, uint* facbuf) {
654 import std.math : floor, sqrt;
655 immutable double floor_sqrt = floor(sqrt(cast(double)n));
656 uint p = 4;
657 // factor out powers of 4, powers of 2, then any remaining primes
658 do {
659 while (n%p) {
660 switch (p) {
661 case 4: p = 2; break;
662 case 2: p = 3; break;
663 default: p += 2; break;
665 if (p > floor_sqrt) p = n; // no more factors, skip to end
667 n /= p;
668 *facbuf++ = p;
669 *facbuf++ = n;
670 } while (n > 1);
674 // ////////////////////////////////////////////////////////////////////////// //
675 // kissfft_guts
677 Explanation of macros dealing with complex math:
679 C_MUL(m,a,b) : m = a*b
680 C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
681 C_SUB( res, a,b) : res = a-b
682 C_SUBFROM( res , a) : res -= a
683 C_ADDTO( res , a) : res += a
685 //enum S_MUL(string a, string b) = "(("~a~")*("~b~"))";
686 enum C_MUL(string m, string a, string b) =
687 "{ ("~m~").r = ("~a~").r*("~b~").r-("~a~").i*("~b~").i;
688 ("~m~").i = ("~a~").r*("~b~").i+("~a~").i*("~b~").r; }";
689 enum C_MULBYSCALAR(string c, string s) = " { ("~c~").r *= ("~s~"); ("~c~").i *= ("~s~"); }";
691 enum C_ADD(string res, string a, string b) = "{ ("~res~").r=("~a~").r+("~b~").r; ("~res~").i=("~a~").i+("~b~").i; }";
692 enum C_SUB(string res, string a, string b) = "{ ("~res~").r=("~a~").r-("~b~").r; ("~res~").i=("~a~").i-("~b~").i; }";
693 enum C_ADDTO(string res, string a) = "{ ("~res~").r += ("~a~").r; ("~res~").i += ("~a~").i; }";
694 enum C_SUBFROM(string res, string a) = "{ ("~res~").r -= ("~a~").r; ("~res~").i -= ("~a~").i; }";
697 //kiss_fft_scalar KISS_FFT_COS(T) (T phase) { import std.math : cos; return cos(phase); }
698 //kiss_fft_scalar KISS_FFT_SIN(T) (T phase) { import std.math : sin; return sin(phase); }
699 //kiss_fft_scalar HALF_OF (kiss_fft_scalar x) { return x*cast(kiss_fft_scalar)0.5; }
700 enum HALF_OF(string x) = "(cast(kiss_fft_scalar!S)(("~x~")*cast(kiss_fft_scalar!S)0.5))";
702 //enum kf_cexp(string x, string phase) = "{ ("~x~").r = KISS_FFT_COS("~phase~"); ("~x~").i = KISS_FFT_SIN("~phase~"); }";
703 enum kf_cexp(string x, string phase) = "{ import std.math : cos, sin; ("~x~").r = cos("~phase~"); ("~x~").i = sin("~phase~"); }";
707 #ifdef KISS_FFT_USE_ALLOCA
708 // define this to allow use of alloca instead of malloc for temporary buffers
709 // Temporary buffers are used in two case:
710 // 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
711 // 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform.
712 #include <alloca.h>
713 #define KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes)
714 #define KISS_FFT_TMP_FREE(ptr)
715 #else
716 #define KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes)
717 #define KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr)
718 #endif
722 // ////////////////////////////////////////////////////////////////////////// //
724 public alias kiss_fftnd_cfg_f = kiss_fftnd_state!float*; ///
725 public alias kiss_fftnd_cfg_d = kiss_fftnd_state!double*; ///
727 public template kiss_fftnd_cfg(T) if (is(T == float) || is(T == double) || is(T == real)) {
728 alias kiss_fftnd_cfg = kiss_fftnd_state!T*;
731 struct kiss_fftnd_state(S)
732 if (is(S == float) || is(S == double) || is(S == real))
734 uint dimprod; /* dimsum would be mighty tasty right now */
735 uint ndims;
736 uint* dims;
737 kiss_fft_cfg!S* states; /* cfg states for each dimension */
738 kiss_fft_cpx!S* tmpbuf; /* buffer capable of hold the entire input */
743 public kiss_fftnd_cfg!S kiss_fftnd_alloc(S) (const(uint)[] dims, KissFFT dir, void* mem=null, usize* lenmem=null)
744 if (is(S == float) || is(S == double) || is(S == real))
746 import core.stdc.stdlib : malloc;
748 if (dims.length < 1 || dims.length > ushort.max) return null;
749 immutable uint ndims = cast(uint)dims.length;
750 kiss_fftnd_cfg!S st = null;
751 int dimprod = 1;
752 usize memneeded = (kiss_fftnd_state!S).sizeof;
753 ubyte* ptr;
755 foreach (immutable uint i; 0..ndims) {
756 usize sublen = 0;
757 kiss_fft_alloc!S(dims[i], dir, null, &sublen);
758 memneeded += sublen; /* st.states[i] */
759 dimprod *= dims[i];
761 memneeded += int.sizeof*ndims;/* st.dims */
762 memneeded += (void*).sizeof*ndims;/* st.states */
763 memneeded += (kiss_fft_cpx!S).sizeof*dimprod; /* st.tmpbuf */
765 if (lenmem is null) {
766 /* allocate for the caller */
767 st = cast(kiss_fftnd_cfg!S)malloc(memneeded);
768 } else {
769 /* initialize supplied buffer if big enough */
770 if (*lenmem >= memneeded) st = cast(kiss_fftnd_cfg!S)mem;
771 *lenmem = memneeded; /* tell caller how big struct is (or would be) */
773 if (st is null) return null; /* malloc failed or buffer too small */
775 st.dimprod = dimprod;
776 st.ndims = ndims;
777 ptr = cast(ubyte*)(st+1);
779 st.states = cast(kiss_fft_cfg!S*)ptr;
780 ptr += (void*).sizeof*ndims;
782 st.dims = cast(uint*)ptr;
783 ptr += int.sizeof*ndims;
785 st.tmpbuf = cast(kiss_fft_cpx!S*)ptr;
786 ptr += (kiss_fft_cpx!S).sizeof*dimprod;
788 foreach (immutable uint i; 0..ndims) {
789 usize len;
790 st.dims[i] = dims[i];
791 kiss_fft_alloc!S(st.dims[i], dir, null, &len);
792 st.states[i] = kiss_fft_alloc!S(st.dims[i], dir, ptr, &len);
793 ptr += len;
796 return st;
801 public void kiss_fftnd_free(S) (ref kiss_fftnd_cfg!S p)
802 if (is(S == float) || is(S == double) || is(S == real))
804 if (p !is null) {
805 import core.stdc.stdlib : free;
806 free(p);
807 p = null;
813 This works by tackling one dimension at a time.
815 In effect,
816 Each stage starts out by reshaping the matrix into a DixSi 2d matrix.
817 A Di-sized fft is taken of each column, transposing the matrix as it goes.
819 Here's a 3-d example:
820 Take a 2x3x4 matrix, laid out in memory as a contiguous buffer
821 [ [ [ a b c d ] [ e f g h ] [ i j k l ] ]
822 [ [ m n o p ] [ q r s t ] [ u v w x ] ] ]
824 Stage 0 ( D=2): treat the buffer as a 2x12 matrix
825 [ [a b ... k l]
826 [m n ... w x] ]
828 FFT each column with size 2.
829 Transpose the matrix at the same time using kiss_fft_stride.
831 [ [ a+m a-m ]
832 [ b+n b-n]
834 [ k+w k-w ]
835 [ l+x l-x ] ]
837 Note fft([x y]) == [x+y x-y]
839 Stage 1 ( D=3) treats the buffer (the output of stage D=2) as an 3x8 matrix,
840 [ [ a+m a-m b+n b-n c+o c-o d+p d-p ]
841 [ e+q e-q f+r f-r g+s g-s h+t h-t ]
842 [ i+u i-u j+v j-v k+w k-w l+x l-x ] ]
844 And perform FFTs (size=3) on each of the columns as above, transposing
845 the matrix as it goes. The output of stage 1 is
846 (Legend: ap = [ a+m e+q i+u ]
847 am = [ a-m e-q i-u ] )
849 [ [ sum(ap) fft(ap)[0] fft(ap)[1] ]
850 [ sum(am) fft(am)[0] fft(am)[1] ]
851 [ sum(bp) fft(bp)[0] fft(bp)[1] ]
852 [ sum(bm) fft(bm)[0] fft(bm)[1] ]
853 [ sum(cp) fft(cp)[0] fft(cp)[1] ]
854 [ sum(cm) fft(cm)[0] fft(cm)[1] ]
855 [ sum(dp) fft(dp)[0] fft(dp)[1] ]
856 [ sum(dm) fft(dm)[0] fft(dm)[1] ] ]
858 Stage 2 ( D=4) treats this buffer as a 4*6 matrix,
859 [ [ sum(ap) fft(ap)[0] fft(ap)[1] sum(am) fft(am)[0] fft(am)[1] ]
860 [ sum(bp) fft(bp)[0] fft(bp)[1] sum(bm) fft(bm)[0] fft(bm)[1] ]
861 [ sum(cp) fft(cp)[0] fft(cp)[1] sum(cm) fft(cm)[0] fft(cm)[1] ]
862 [ sum(dp) fft(dp)[0] fft(dp)[1] sum(dm) fft(dm)[0] fft(dm)[1] ] ]
864 Then FFTs each column, transposing as it goes.
866 The resulting matrix is the 3d FFT of the 2x3x4 input matrix.
868 Note as a sanity check that the first element of the final
869 stage's output (DC term) is
870 sum( [ sum(ap) sum(bp) sum(cp) sum(dp) ] )
871 , i.e. the summation of all 24 input elements.
874 public void kiss_fftnd(S) (kiss_fftnd_cfg!S st, const(kiss_fft_cpx!S)* fin, kiss_fft_cpx!S* fout)
875 if (is(S == float) || is(S == double) || is(S == real))
877 import core.stdc.string : memcpy;
879 const(kiss_fft_cpx!S)* bufin = fin;
880 kiss_fft_cpx!S* bufout;
882 /* arrange it so the last bufout == fout */
883 if (st.ndims&1) {
884 bufout = fout;
885 if (fin is fout) {
886 memcpy(st.tmpbuf, fin, (kiss_fft_cpx!S).sizeof*st.dimprod);
887 bufin = st.tmpbuf;
889 } else {
890 bufout = st.tmpbuf;
893 foreach (immutable uint k; 0..st.ndims) {
894 uint curdim = st.dims[k];
895 uint stride = st.dimprod/curdim;
897 foreach (immutable uint i; 0..stride) {
898 kiss_fft_stride!S(st.states[k], bufin+i, bufout+i*curdim, stride);
901 /* toggle back and forth between the two buffers */
902 if (bufout is st.tmpbuf) {
903 bufout = fout;
904 bufin = st.tmpbuf;
905 } else {
906 bufout = st.tmpbuf;
907 bufin = fout;
913 // ////////////////////////////////////////////////////////////////////////// //
915 public alias kiss_fftndr_cfg_f = kiss_fftndr_state!float*; ///
916 public alias kiss_fftndr_cfg_d = kiss_fftndr_state!double*; ///
918 public template kiss_fftndr_cfg(T) if (is(T == float) || is(T == double) || is(T == real)) {
919 alias kiss_fftndr_cfg = kiss_fftndr_state!T*;
922 struct kiss_fftndr_state(S)
923 if (is(S == float) || is(S == double) || is(S == real))
925 uint dimReal;
926 uint dimOther;
927 kiss_fftr_cfg!S cfg_r;
928 kiss_fftnd_cfg!S cfg_nd;
929 ubyte* tmpbuf;
933 int prod (const(uint)[] dims) pure nothrow @trusted @nogc {
934 uint x = 1;
935 uint ndims = cast(uint)dims.length;
936 const(uint)* dp = dims.ptr;
937 while (ndims--) x *= *dp++;
938 return x;
941 T MAX(T) (in T a, in T b) pure nothrow @safe @nogc { pragma(inline, true); return (a > b ? a : b); }
945 dims[0] must be even
947 If you don't care to allocate space, use mem = lenmem = null
949 public kiss_fftndr_cfg!S kiss_fftndr_alloc(S) (const(uint)[] dims, KissFFT dir, void* mem=null, usize* lenmem=null)
950 if (is(S == float) || is(S == double) || is(S == real))
952 import core.stdc.stdlib : malloc, free;
953 import core.stdc.string : memset;
955 if (dims.length < 1 || dims.length > ushort.max) return null;
956 immutable uint ndims = cast(uint)dims.length;
958 kiss_fftndr_cfg!S st = null;
959 usize nr = 0, nd = 0, ntmp = 0;
960 uint dimReal = dims[ndims-1];
961 uint dimOther = prod(dims[0..$-1]);
962 usize memneeded;
964 kiss_fftr_alloc!S(dimReal, dir, null, &nr);
965 kiss_fftnd_alloc!S(dims[0..$-1], dir, null, &nd);
966 ntmp =
967 MAX(2*dimOther, dimReal+2)*(kiss_fft_scalar!S).sizeof+ // freq buffer for one pass
968 dimOther*(dimReal+2)*(kiss_fft_scalar!S).sizeof; // large enough to hold entire input in case of in-place
970 memneeded = (kiss_fftndr_state!S).sizeof+nr+nd+ntmp;
972 bool malloced = false;
973 if (lenmem is null) {
974 st = cast(kiss_fftndr_cfg!S)malloc(memneeded);
975 malloced = true;
976 } else {
977 if (*lenmem >= memneeded) st = cast(kiss_fftndr_cfg!S)mem;
978 *lenmem = memneeded;
980 if (st is null) return null;
982 memset(st, 0, memneeded);
984 st.dimReal = dimReal;
985 st.dimOther = dimOther;
986 st.cfg_r = kiss_fftr_alloc!S(dimReal, dir, st+1, &nr);
987 if (st.cfg_r is null) { if (malloced) free(st); return null; }
988 st.cfg_nd = kiss_fftnd_alloc!S(dims[0..$-1], dir, (cast(ubyte*)st.cfg_r)+nr, &nd);
989 if (st.cfg_nd is null) { if (malloced) free(st); return null; }
990 st.tmpbuf = cast(ubyte*)st.cfg_nd+nd;
991 assert(st.tmpbuf !is null);
993 return st;
998 public void kiss_fftndr_free(S) (ref kiss_fftndr_cfg!S p)
999 if (is(S == float) || is(S == double) || is(S == real))
1001 if (p !is null) {
1002 import core.stdc.stdlib : free;
1003 free(p);
1004 p = null;
1010 input timedata has dims[0] X dims[1] X ... X dims[ndims-1] scalar points
1011 output freqdata has dims[0] X dims[1] X ... X dims[ndims-1]/2+1 complex points
1013 public void kiss_fftndr(S) (kiss_fftndr_cfg!S st, const(kiss_fft_scalar!S)* timedata, kiss_fft_cpx!S* freqdata)
1014 if (is(S == float) || is(S == double) || is(S == real))
1016 uint dimReal = st.dimReal;
1017 uint dimOther = st.dimOther;
1018 uint nrbins = dimReal/2+1;
1020 kiss_fft_cpx!S* tmp1 = cast(kiss_fft_cpx!S*)st.tmpbuf;
1021 kiss_fft_cpx!S* tmp2 = tmp1+MAX(nrbins, dimOther);
1022 //assert(tmp1 !is null);
1024 // timedata is N0 x N1 x ... x Nk real
1026 // take a real chunk of data, fft it and place the output at correct intervals
1027 foreach (immutable uint k1; 0..dimOther) {
1028 kiss_fftr!S(st.cfg_r, timedata+k1*dimReal, tmp1); // tmp1 now holds nrbins complex points
1029 foreach (immutable uint k2; 0..nrbins) tmp2[k2*dimOther+k1] = tmp1[k2];
1032 foreach (immutable uint k2; 0..nrbins) {
1033 kiss_fftnd!S(st.cfg_nd, tmp2+k2*dimOther, tmp1); // tmp1 now holds dimOther complex points
1034 foreach (immutable uint k1; 0..dimOther) freqdata[k1*nrbins+k2] = tmp1[k1];
1040 input and output dimensions are the exact opposite of kiss_fftndr
1042 public void kiss_fftndri(S) (kiss_fftndr_cfg!S st, const(kiss_fft_cpx!S)* freqdata, kiss_fft_scalar!S* timedata)
1043 if (is(S == float) || is(S == double) || is(S == real))
1045 int dimReal = st.dimReal;
1046 int dimOther = st.dimOther;
1047 int nrbins = dimReal/2+1;
1048 kiss_fft_cpx!S* tmp1 = cast(kiss_fft_cpx!S*)st.tmpbuf;
1049 kiss_fft_cpx!S* tmp2 = tmp1+MAX(nrbins, dimOther);
1051 foreach (immutable uint k2; 0..nrbins) {
1052 foreach (immutable uint k1; 0..dimOther) tmp1[k1] = freqdata[k1*(nrbins)+k2];
1053 kiss_fftnd!S(st.cfg_nd, tmp1, tmp2+k2*dimOther);
1056 foreach (immutable uint k1; 0..dimOther) {
1057 foreach (immutable uint k2; 0..nrbins) tmp1[k2] = tmp2[k2*dimOther+k1];
1058 kiss_fftri!S(st.cfg_r, tmp1, timedata+k1*dimReal);