1 # Lossy compression algorithms very often make use of DCT or DFT calculations,
2 # or variations of these calculations. This file is intended to be a short
3 # reference about classical DCT and DFT algorithms.
6 from random
import random
7 from math
import pi
, sin
, cos
, sqrt
11 return exp (alpha
* 1j
)
15 return c
.real
- 1j
* c
.imag
21 # Let us start withthe canonical definition of the unscaled DFT algorithm :
22 # (I can not draw sigmas in a text file so I'll use python code instead) :)
25 return exp_j ((-2*pi
*k
)/N
)
27 def unscaled_DFT (N
, input, output
):
28 for o
in range(N
): # o is output index
31 output
[o
] = output
[o
] + input[i
] * W (i
*o
, N
)
33 # This algorithm takes complex input and output. There are N*N complex
34 # multiplications and N*(N-1) complex additions.
37 # Of course this algorithm is an extremely naive implementation and there are
38 # some ways to use the trigonometric properties of the coefficients to find
39 # some decompositions that can accelerate the calculation by several orders
40 # of magnitude... This is a well known and studied problem. One of the
41 # available explanations of this process is at this url :
42 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
45 # Let's start with the radix-2 decimation-in-time algorithm :
47 def unscaled_DFT_radix2_time (N
, input, output
):
48 even_input
= vector(N
/2)
49 odd_input
= vector(N
/2)
50 even_output
= vector(N
/2)
51 odd_output
= vector(N
/2)
54 even_input
[i
] = input[2*i
]
55 odd_input
[i
] = input[2*i
+1]
57 unscaled_DFT (N
/2, even_input
, even_output
)
58 unscaled_DFT (N
/2, odd_input
, odd_output
)
61 odd_output
[i
] = odd_output
[i
] * W (i
, N
)
64 output
[i
] = even_output
[i
] + odd_output
[i
]
65 output
[i
+N
/2] = even_output
[i
] - odd_output
[i
]
67 # This algorithm takes complex input and output.
69 # We divide the DFT calculation into 2 DFT calculations of size N/2
70 # We then do N/2 complex multiplies followed by N complex additions.
71 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
72 # multiplies... we will skip 1 for i=0 and 1 for i=N/4. Also for i=N/8 and for
73 # i=3*N/8 the W(i,N) values can be special-cased to implement the complex
74 # multiplication using only 2 real additions and 2 real multiplies)
76 # Also note that all the basic stages of this DFT algorithm are easily
77 # reversible, so we can calculate the IDFT with the same complexity.
80 # A variant of this is the radix-2 decimation-in-frequency algorithm :
82 def unscaled_DFT_radix2_freq (N
, input, output
):
83 even_input
= vector(N
/2)
84 odd_input
= vector(N
/2)
85 even_output
= vector(N
/2)
86 odd_output
= vector(N
/2)
89 even_input
[i
] = input[i
] + input[i
+N
/2]
90 odd_input
[i
] = input[i
] - input[i
+N
/2]
93 odd_input
[i
] = odd_input
[i
] * W (i
, N
)
95 unscaled_DFT (N
/2, even_input
, even_output
)
96 unscaled_DFT (N
/2, odd_input
, odd_output
)
99 output
[2*i
] = even_output
[i
]
100 output
[2*i
+1] = odd_output
[i
]
102 # Note that the decimation-in-time and the decimation-in-frequency varients
103 # have exactly the same complexity, they only do the operations in a different
106 # Actually, if you look at the decimation-in-time variant of the DFT, and
107 # reverse it to calculate an IDFT, you get something that is extremely close
108 # to the decimation-in-frequency DFT algorithm...
111 # The radix-4 algorithms are slightly more efficient : they take into account
112 # the fact that with complex numbers, multiplications by j and -j are also
113 # "free"... i.e. when you code them using real numbers, they translate into
114 # a few data moves but no real operation.
116 # Let's start with the radix-4 decimation-in-time algorithm :
118 def unscaled_DFT_radix4_time (N
, input, output
):
119 input_0
= vector(N
/4)
120 input_1
= vector(N
/4)
121 input_2
= vector(N
/4)
122 input_3
= vector(N
/4)
123 output_0
= vector(N
/4)
124 output_1
= vector(N
/4)
125 output_2
= vector(N
/4)
126 output_3
= vector(N
/4)
133 input_0
[i
] = input[4*i
]
134 input_1
[i
] = input[4*i
+1]
135 input_2
[i
] = input[4*i
+2]
136 input_3
[i
] = input[4*i
+3]
138 unscaled_DFT (N
/4, input_0
, output_0
)
139 unscaled_DFT (N
/4, input_1
, output_1
)
140 unscaled_DFT (N
/4, input_2
, output_2
)
141 unscaled_DFT (N
/4, input_3
, output_3
)
144 output_1
[i
] = output_1
[i
] * W (i
, N
)
145 output_2
[i
] = output_2
[i
] * W (2*i
, N
)
146 output_3
[i
] = output_3
[i
] * W (3*i
, N
)
149 tmp_0
[i
] = output_0
[i
] + output_2
[i
]
150 tmp_1
[i
] = output_0
[i
] - output_2
[i
]
151 tmp_2
[i
] = output_1
[i
] + output_3
[i
]
152 tmp_3
[i
] = output_1
[i
] - output_3
[i
]
155 output
[i
] = tmp_0
[i
] + tmp_2
[i
]
156 output
[i
+N
/4] = tmp_1
[i
] - 1j
* tmp_3
[i
]
157 output
[i
+N
/2] = tmp_0
[i
] - tmp_2
[i
]
158 output
[i
+3*N
/4] = tmp_1
[i
] + 1j
* tmp_3
[i
]
160 # This algorithm takes complex input and output.
162 # We divide the DFT calculation into 4 DFT calculations of size N/4
163 # We then do 3*N/4 complex multiplies followed by 2*N complex additions.
164 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
165 # multiplies... we will skip 3 for i=0 and 1 for i=N/8. Also for i=N/8
166 # the remaining W(i,N) and W(3*i,N) multiplies can be implemented using only
167 # 2 real additions and 2 real multiplies. For i=N/16 and i=3*N/16 we can also
168 # optimise the W(2*i/N) multiply this way.
170 # If we wanted to do the same decomposition with one radix-2 decomposition
171 # of size N and 2 radix-2 decompositions of size N/2, the total cost would be
172 # N complex multiplies and 2*N complex additions. Thus we see that the
173 # decomposition of one DFT calculation of size N into 4 calculations of size
174 # N/4 using the radix-4 algorithm instead of the radix-2 algorithm saved N/4
175 # complex multiplies...
178 # The radix-4 decimation-in-frequency algorithm is similar :
180 def unscaled_DFT_radix4_freq (N
, input, output
):
181 input_0
= vector(N
/4)
182 input_1
= vector(N
/4)
183 input_2
= vector(N
/4)
184 input_3
= vector(N
/4)
185 output_0
= vector(N
/4)
186 output_1
= vector(N
/4)
187 output_2
= vector(N
/4)
188 output_3
= vector(N
/4)
195 tmp_0
[i
] = input[i
] + input[i
+N
/2]
196 tmp_1
[i
] = input[i
+N
/4] + input[i
+3*N
/4]
197 tmp_2
[i
] = input[i
] - input[i
+N
/2]
198 tmp_3
[i
] = input[i
+N
/4] - input[i
+3*N
/4]
201 input_0
[i
] = tmp_0
[i
] + tmp_1
[i
]
202 input_1
[i
] = tmp_2
[i
] - 1j
* tmp_3
[i
]
203 input_2
[i
] = tmp_0
[i
] - tmp_1
[i
]
204 input_3
[i
] = tmp_2
[i
] + 1j
* tmp_3
[i
]
207 input_1
[i
] = input_1
[i
] * W (i
, N
)
208 input_2
[i
] = input_2
[i
] * W (2*i
, N
)
209 input_3
[i
] = input_3
[i
] * W (3*i
, N
)
211 unscaled_DFT (N
/4, input_0
, output_0
)
212 unscaled_DFT (N
/4, input_1
, output_1
)
213 unscaled_DFT (N
/4, input_2
, output_2
)
214 unscaled_DFT (N
/4, input_3
, output_3
)
217 output
[4*i
] = output_0
[i
]
218 output
[4*i
+1] = output_1
[i
]
219 output
[4*i
+2] = output_2
[i
]
220 output
[4*i
+3] = output_3
[i
]
222 # Once again the complexity is exactly the same as for the radix-4
223 # decimation-in-time DFT algorithm, only the order of the operations is
227 # Now let us reorder the radix-4 algorithms in a different way :
229 #def unscaled_DFT_radix4_time (N, input, output):
230 # input_0 = vector(N/4)
231 # input_1 = vector(N/4)
232 # input_2 = vector(N/4)
233 # input_3 = vector(N/4)
234 # output_0 = vector(N/4)
235 # output_1 = vector(N/4)
236 # output_2 = vector(N/4)
237 # output_3 = vector(N/4)
238 # tmp_0 = vector(N/4)
239 # tmp_1 = vector(N/4)
240 # tmp_2 = vector(N/4)
241 # tmp_3 = vector(N/4)
243 # for i in range(N/4):
244 # input_0[i] = input[4*i]
245 # input_2[i] = input[4*i+2]
247 # unscaled_DFT (N/4, input_0, output_0)
248 # unscaled_DFT (N/4, input_2, output_2)
250 # for i in range(N/4):
251 # output_2[i] = output_2[i] * W (2*i, N)
253 # for i in range(N/4):
254 # tmp_0[i] = output_0[i] + output_2[i]
255 # tmp_1[i] = output_0[i] - output_2[i]
257 # for i in range(N/4):
258 # input_1[i] = input[4*i+1]
259 # input_3[i] = input[4*i+3]
261 # unscaled_DFT (N/4, input_1, output_1)
262 # unscaled_DFT (N/4, input_3, output_3)
264 # for i in range(N/4):
265 # output_1[i] = output_1[i] * W (i, N)
266 # output_3[i] = output_3[i] * W (3*i, N)
268 # for i in range(N/4):
269 # tmp_2[i] = output_1[i] + output_3[i]
270 # tmp_3[i] = output_1[i] - output_3[i]
272 # for i in range(N/4):
273 # output[i] = tmp_0[i] + tmp_2[i]
274 # output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
275 # output[i+N/2] = tmp_0[i] - tmp_2[i]
276 # output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
278 # We didn't do anything here, only reorder the operations. But now, look at the
279 # first part of this function, up to the calculations of tmp0 and tmp1 : this
280 # is extremely similar to the radix-2 decimation-in-time algorithm ! or more
281 # precisely, it IS the radix-2 decimation-in-time algorithm, with size N/2,
282 # applied on a vector representing the even input coefficients, and giving
283 # an output vector that is the concatenation of tmp0 and tmp1.
284 # This is important to notice, because this means we can now choose to
285 # calculate tmp0 and tmp1 using any DFT algorithm that we want, and we know
286 # that some of them are more efficient than radix-2...
288 # This leads us directly to the split-radix decimation-in-time algorithm :
290 def unscaled_DFT_split_radix_time (N
, input, output
):
291 even_input
= vector(N
/2)
292 input_1
= vector(N
/4)
293 input_3
= vector(N
/4)
294 even_output
= vector(N
/2)
295 output_1
= vector(N
/4)
296 output_3
= vector(N
/4)
301 even_input
[i
] = input[2*i
]
304 input_1
[i
] = input[4*i
+1]
305 input_3
[i
] = input[4*i
+3]
307 unscaled_DFT (N
/2, even_input
, even_output
)
308 unscaled_DFT (N
/4, input_1
, output_1
)
309 unscaled_DFT (N
/4, input_3
, output_3
)
312 output_1
[i
] = output_1
[i
] * W (i
, N
)
313 output_3
[i
] = output_3
[i
] * W (3*i
, N
)
316 tmp_0
[i
] = output_1
[i
] + output_3
[i
]
317 tmp_1
[i
] = output_1
[i
] - output_3
[i
]
320 output
[i
] = even_output
[i
] + tmp_0
[i
]
321 output
[i
+N
/4] = even_output
[i
+N
/4] - 1j
* tmp_1
[i
]
322 output
[i
+N
/2] = even_output
[i
] - tmp_0
[i
]
323 output
[i
+3*N
/4] = even_output
[i
+N
/4] + 1j
* tmp_1
[i
]
325 # This function performs one DFT of size N/2 and two of size N/4, followed by
326 # N/2 complex multiplies and 3*N/2 complex additions.
327 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
328 # multiplies... we will skip 2 for i=0. Also for i=N/8 the W(i,N) and W(3*i,N)
329 # multiplies can be implemented using only 2 real additions and 2 real
333 # We can similarly define the split-radix decimation-in-frequency DFT :
335 def unscaled_DFT_split_radix_freq (N
, input, output
):
336 even_input
= vector(N
/2)
337 input_1
= vector(N
/4)
338 input_3
= vector(N
/4)
339 even_output
= vector(N
/2)
340 output_1
= vector(N
/4)
341 output_3
= vector(N
/4)
346 even_input
[i
] = input[i
] + input[i
+N
/2]
349 tmp_0
[i
] = input[i
] - input[i
+N
/2]
350 tmp_1
[i
] = input[i
+N
/4] - input[i
+3*N
/4]
353 input_1
[i
] = tmp_0
[i
] - 1j
* tmp_1
[i
]
354 input_3
[i
] = tmp_0
[i
] + 1j
* tmp_1
[i
]
357 input_1
[i
] = input_1
[i
] * W (i
, N
)
358 input_3
[i
] = input_3
[i
] * W (3*i
, N
)
360 unscaled_DFT (N
/2, even_input
, even_output
)
361 unscaled_DFT (N
/4, input_1
, output_1
)
362 unscaled_DFT (N
/4, input_3
, output_3
)
365 output
[2*i
] = even_output
[i
]
368 output
[4*i
+1] = output_1
[i
]
369 output
[4*i
+3] = output_3
[i
]
371 # The complexity is again the same as for the decimation-in-time variant.
374 # Now let us now summarize our various algorithms for DFT decomposition :
376 # radix-2 : DFT(N) -> 2*DFT(N/2) using N/2 multiplies and N additions
377 # radix-4 : DFT(N) -> 4*DFT(N/2) using 3*N/4 multiplies and 2*N additions
378 # split-radix : DFT(N) -> DFT(N/2) + 2*DFT(N/4) using N/2 muls and 3*N/2 adds
380 # (we are always speaking of complex multiplies and complex additions... a
381 # complex addition is implemented with 2 real additions, and a complex
382 # multiply is implemented with either 2 adds and 4 muls or 3 adds and 3 muls,
383 # so we will keep a separate count of these)
385 # If we want to take into account the special values of W(i,N), we can remove
386 # a few complex multiplies. Supposing N>=16 we can remove :
387 # radix-2 : remove 2 complex multiplies, simplify 2 others
388 # radix-4 : remove 4 complex multiplies, simplify 4 others
389 # split-radix : remove 2 complex multiplies, simplify 2 others
391 # This gives the following table for the complexity of a complex DFT :
392 # N real additions real multiplies complex multiplies
403 # 1024 21160 680 2164
404 # 2048 46420 1364 5008
405 # 4096 101032 2728 11380
406 # 8192 218452 5460 25488
407 # 16384 469672 10920 56436
408 # 32768 1004884 21844 123792
409 # 65536 2140840 43688 269428
411 # If we chose to implement complex multiplies with 3 real muls + 3 real adds,
412 # then these results are consistent with the table at the end of the
413 # www.cmlab.csie.ntu.edu.tw DFT tutorial that I mentionned earlier.
416 # Now another important case for the DFT is the one where the inputs are
417 # real numbers instead of complex ones. We have to find ways to optimize for
418 # this important case.
420 # If the DFT inputs are real-valued, then the DFT outputs have nice properties
421 # too : output[0] and output[N/2] will be real numbers, and output[N-i] will
422 # be the conjugate of output[i] for i in 0...N/2-1
424 # Likewise, if the DFT inputs are purely imaginary numbers, then the DFT
425 # outputs will have special properties too : output[0] and output[N/2] will be
426 # purely imaginary, and output[N-i] will be the opposite of the conjugate of
427 # output[i] for i in 0...N/2-1
429 # We can use these properties to calculate two real-valued DFT at once :
431 def two_real_unscaled_DFT (N
, input1
, input2
, output1
, output2
):
436 input[i
] = input1
[i
] + 1j
* input2
[i
]
438 unscaled_DFT (N
, input, output
)
440 output1
[0] = output
[0].real
+ 0j
441 output2
[0] = output
[0].imag
+ 0j
443 for i
in range(N
/2)[1:]:
444 output1
[i
] = 0.5 * (output
[i
] + conjugate(output
[N
-i
]))
445 output2
[i
] = -0.5j
* (output
[i
] - conjugate(output
[N
-i
]))
447 output1
[N
-i
] = conjugate(output1
[i
])
448 output2
[N
-i
] = conjugate(output2
[i
])
450 output1
[N
/2] = output
[N
/2].real
+ 0j
451 output2
[N
/2] = output
[N
/2].imag
+ 0j
453 # This routine does a total of N-2 complex additions and N-2 complex
456 # This routine can also be inverted to calculate the IDFT of two vectors at
457 # once if we know that the outputs will be real-valued.
460 # If we have only one real-valued DFT calculation to do, we can still cut this
461 # calculation in several parts using one of the decimate-in-time methods
462 # (so that the different parts are still real-valued)
464 # As with complex DFT calculations, the best method is to use a split radix.
465 # There are a lot of symetries in the DFT outputs that we can exploit to
466 # reduce the number of operations...
468 def real_unscaled_DFT_split_radix_time_1 (N
, input, output
):
469 even_input
= vector(N
/2)
470 even_output
= vector(N
/2)
471 input_1
= vector(N
/4)
472 output_1
= vector(N
/4)
473 input_3
= vector(N
/4)
474 output_3
= vector(N
/4)
479 even_input
[i
] = input[2*i
]
482 input_1
[i
] = input[4*i
+1]
483 input_3
[i
] = input[4*i
+3]
485 unscaled_DFT (N
/2, even_input
, even_output
)
486 # this is again a real DFT !
487 # we will only use even_output[i] for i in 0 ... N/4 included. we know that
488 # even_output[N/2-i] is the conjugate of even_output[i]... also we know
489 # that even_output[0] and even_output[N/4] are purely real.
491 unscaled_DFT (N
/4, input_1
, output_1
)
492 unscaled_DFT (N
/4, input_3
, output_3
)
493 # these are real DFTs too... with symetries in the outputs... once again
495 tmp_0
[0] = output_1
[0] + output_3
[0] # real numbers
496 tmp_1
[0] = output_1
[0] - output_3
[0] # real numbers
498 tmp__0
= (output_1
[N
/8] + output_3
[N
/8]) * sqrt(0.5) # real numbers
499 tmp__1
= (output_1
[N
/8] - output_3
[N
/8]) * sqrt(0.5) # real numbers
500 tmp_0
[N
/8] = tmp__1
- 1j
* tmp__0
# real + 1j * real
501 tmp_1
[N
/8] = tmp__0
- 1j
* tmp__1
# real + 1j * real
503 for i
in range(N
/8)[1:]:
504 output_1
[i
] = output_1
[i
] * W (i
, N
)
505 output_3
[i
] = output_3
[i
] * W (3*i
, N
)
507 tmp_0
[i
] = output_1
[i
] + output_3
[i
]
508 tmp_1
[i
] = output_1
[i
] - output_3
[i
]
510 tmp_0
[N
/4-i
] = -1j
* conjugate(tmp_1
[i
])
511 tmp_1
[N
/4-i
] = -1j
* conjugate(tmp_0
[i
])
513 output
[0] = even_output
[0] + tmp_0
[0] # real numbers
514 output
[N
/4] = even_output
[N
/4] - 1j
* tmp_1
[0] # real + 1j * real
515 output
[N
/2] = even_output
[0] - tmp_0
[0] # real numbers
516 output
[3*N
/4] = even_output
[N
/4] + 1j
* tmp_1
[0] # real + 1j * real
518 for i
in range(N
/4)[1:]:
519 output
[i
] = even_output
[i
] + tmp_0
[i
]
520 output
[i
+N
/4] = conjugate(even_output
[N
/4-i
]) - 1j
* tmp_1
[i
]
522 output
[N
-i
] = conjugate(output
[i
])
523 output
[3*N
/4-i
] = conjugate(output
[i
+N
/4])
525 # This function performs one real DFT of size N/2 and two real DFT of size
526 # N/4, followed by 6 real additions, 2 real multiplies, 3*N/4-4 complex
527 # additions and N/4-2 complex multiplies.
530 # We can also try to combine the two real DFT of size N/4 into a single complex
533 def real_unscaled_DFT_split_radix_time_2 (N
, input, output
):
534 even_input
= vector(N
/2)
535 even_output
= vector(N
/2)
536 odd_input
= vector(N
/4)
537 odd_output
= vector(N
/4)
542 even_input
[i
] = input[2*i
]
545 odd_input
[i
] = input[4*i
+1] + 1j
* input[4*i
+3]
547 unscaled_DFT (N
/2, even_input
, even_output
)
548 # this is again a real DFT !
549 # we will only use even_output[i] for i in 0 ... N/4 included. we know that
550 # even_output[N/2-i] is the conjugate of even_output[i]... also we know
551 # that even_output[0] and even_output[N/4] are purely real.
553 unscaled_DFT (N
/4, odd_input
, odd_output
)
554 # but this one is a complex DFT so no special properties here
556 output_1
= odd_output
[0].real
557 output_3
= odd_output
[0].imag
558 tmp_0
[0] = output_1
+ output_3
# real numbers
559 tmp_1
[0] = output_1
- output_3
# real numbers
561 output_1
= odd_output
[N
/8].real
562 output_3
= odd_output
[N
/8].imag
563 tmp__0
= (output_1
+ output_3
) * sqrt(0.5) # real numbers
564 tmp__1
= (output_1
- output_3
) * sqrt(0.5) # real numbers
565 tmp_0
[N
/8] = tmp__1
- 1j
* tmp__0
# real + 1j * real
566 tmp_1
[N
/8] = tmp__0
- 1j
* tmp__1
# real + 1j * real
568 for i
in range(N
/8)[1:]:
569 output_1
= odd_output
[i
] + conjugate(odd_output
[N
/4-i
])
570 output_3
= odd_output
[i
] - conjugate(odd_output
[N
/4-i
])
572 output_1
= output_1
* 0.5 * W (i
, N
)
573 output_3
= output_3
* -0.5j
* W (3*i
, N
)
575 tmp_0
[i
] = output_1
+ output_3
576 tmp_1
[i
] = output_1
- output_3
578 tmp_0
[N
/4-i
] = -1j
* conjugate(tmp_1
[i
])
579 tmp_1
[N
/4-i
] = -1j
* conjugate(tmp_0
[i
])
581 output
[0] = even_output
[0] + tmp_0
[0] # real numbers
582 output
[N
/4] = even_output
[N
/4] - 1j
* tmp_1
[0] # real + 1j * real
583 output
[N
/2] = even_output
[0] - tmp_0
[0] # real numbers
584 output
[3*N
/4] = even_output
[N
/4] + 1j
* tmp_1
[0] # real + 1j * real
586 for i
in range(N
/4)[1:]:
587 output
[i
] = even_output
[i
] + tmp_0
[i
]
588 output
[i
+N
/4] = conjugate(even_output
[N
/4-i
]) - 1j
* tmp_1
[i
]
590 output
[N
-i
] = conjugate(output
[i
])
591 output
[3*N
/4-i
] = conjugate(output
[i
+N
/4])
593 # This function performs one real DFT of size N/2 and one complex DFT of size
594 # N/4, followed by 6 real additions, 2 real multiplies, N-6 complex additions
595 # and N/4-2 complex multiplies.
597 # After comparing the performance, it turns out that for real-valued DFT, the
598 # version of the algorithm that subdivides the calculation into one real
599 # DFT of size N/2 and two real DFT of size N/4 is the most efficient one.
600 # The other version gives exactly the same number of multiplies and a few more
604 # Now we can also try the decimate-in-frequency method for a real-valued DFT.
605 # Using the split-radix algorithm, and by taking into account the symetries of
608 def real_unscaled_DFT_split_radix_freq (N
, input, output
):
609 even_input
= vector(N
/2)
610 input_1
= vector(N
/4)
611 even_output
= vector(N
/2)
612 output_1
= vector(N
/4)
617 even_input
[i
] = input[i
] + input[i
+N
/2]
620 tmp_0
[i
] = input[i
] - input[i
+N
/2]
621 tmp_1
[i
] = input[i
+N
/4] - input[i
+3*N
/4]
624 input_1
[i
] = tmp_0
[i
] - 1j
* tmp_1
[i
]
627 input_1
[i
] = input_1
[i
] * W (i
, N
)
629 unscaled_DFT (N
/2, even_input
, even_output
)
630 # This is still a real-valued DFT
632 unscaled_DFT (N
/4, input_1
, output_1
)
633 # But that one is a complex-valued DFT
636 output
[2*i
] = even_output
[i
]
639 output
[4*i
+1] = output_1
[i
]
640 output
[N
-1-4*i
] = conjugate(output_1
[i
])
642 # I think this implementation is much more elegant than the decimate-in-time
643 # version ! It looks very much like the complex-valued version, all we had to
644 # do was remove one of the complex-valued internal DFT calls because we could
645 # deduce the outputs by using the symetries of the problem.
647 # As for performance, we did N real additions, N/4 complex multiplies (a bit
648 # less actually, because W(0,N) = 1 and W(N/8,N) is a "simple" multiply), then
649 # one real DFT of size N/2 and one complex DFT of size N/4.
651 # It turns out that even if the methods are so different, the number of
652 # operations is exactly the same as for the best of the two decimation-in-time
653 # methods that we tried.
656 # This gives us the following performance for real-valued DFT :
657 # N real additions real multiplies complex multiplies
668 # 2048 21164 682 2504
669 # 4096 46422 1364 5690
670 # 8192 101036 2730 12744
671 # 16384 218454 5460 28218
672 # 32768 469676 10922 61896
673 # 65536 1004886 21844 134714
676 # As an example, this is an implementation of the real-valued DFT8 :
678 def DFT8 (input, output
):
679 even_0
= input[0] + input[4]
680 even_1
= input[1] + input[5]
681 even_2
= input[2] + input[6]
682 even_3
= input[3] + input[7]
684 tmp_0
= even_0
+ even_2
685 tmp_1
= even_0
- even_2
686 tmp_2
= even_1
+ even_3
687 tmp_3
= even_1
- even_3
689 output
[0] = tmp_0
+ tmp_2
690 output
[2] = tmp_1
- 1j
* tmp_3
691 output
[4] = tmp_0
- tmp_2
693 odd_0_r
= input[0] - input[4]
694 odd_0_i
= input[2] - input[6]
696 tmp_0
= input[1] - input[5]
697 tmp_1
= input[3] - input[7]
698 odd_1_r
= (tmp_0
- tmp_1
) * sqrt(0.5)
699 odd_1_i
= (tmp_0
+ tmp_1
) * sqrt(0.5)
701 output
[1] = (odd_0_r
+ odd_1_r
) - 1j
* (odd_0_i
+ odd_1_i
)
702 output
[5] = (odd_0_r
- odd_1_r
) - 1j
* (odd_0_i
- odd_1_i
)
704 output
[3] = conjugate(output
[5])
705 output
[6] = conjugate(output
[2])
706 output
[7] = conjugate(output
[1])
709 # Also a basic implementation of the real-valued DFT4 :
711 def DFT4 (input, output
):
712 tmp_0
= input[0] + input[2]
713 tmp_1
= input[0] - input[2]
714 tmp_2
= input[1] + input[3]
715 tmp_3
= input[1] - input[3]
717 output
[0] = tmp_0
+ tmp_2
718 output
[1] = tmp_1
- 1j
* tmp_3
719 output
[2] = tmp_0
- tmp_2
720 output
[3] = tmp_1
+ 1j
* tmp_3
723 # A similar idea might be used to calculate only the real part of the output
724 # of a complex DFT : we take an DFT algorithm for real inputs and complex
725 # outputs and we simply reverse it. The resulting algorithm will only work
726 # with inputs that satisfy the conjugaison rule (input[i] is the conjugate of
727 # input[N-i]) so we can do a first pass to modify the input so that it follows
728 # this rule. An example implementation is as follows (adapted from the
729 # unscaled_DFT_split_radix_time algorithm) :
731 def complex2real_unscaled_DFT_split_radix_time (N
, input, output
):
732 even_input
= vector(N
/2)
733 input_1
= vector(N
/4)
734 even_output
= vector(N
/2)
735 output_1
= vector(N
/4)
738 even_input
[i
] = input[2*i
]
741 input_1
[i
] = input[4*i
+1] + conjugate(input[N
-1-4*i
])
743 unscaled_DFT (N
/2, even_input
, even_output
)
744 unscaled_DFT (N
/4, input_1
, output_1
)
747 output_1
[i
] = output_1
[i
] * W (i
, N
)
750 output
[i
] = even_output
[i
] + output_1
[i
].real
751 output
[i
+N
/4] = even_output
[i
+N
/4] + output_1
[i
].imag
752 output
[i
+N
/2] = even_output
[i
] - output_1
[i
].real
753 output
[i
+3*N
/4] = even_output
[i
+N
/4] - output_1
[i
].imag
755 # This algorithm does N/4 complex additions, N/4-1 complex multiplies
756 # (including one "simple" multiply for i=N/8), N real additions, one
757 # "complex-to-real" DFT of size N/2, and one complex DFT of size N/4.
758 # Also, in the complex DFT of size N/4, we do not care about the imaginary
759 # part of output_1[0], which in practice allows us to save one real addition.
761 # This gives us the following performance for complex DFT with real outputs :
762 # N real additions real multiplies complex multiplies
773 # 1024 10572 340 1082
774 # 2048 23201 682 2504
775 # 4096 50506 1364 5690
776 # 8192 109215 2730 12744
777 # 16384 234824 5460 28218
778 # 32768 502429 10922 61896
779 # 65536 1070406 21844 134714
782 # Now let's talk about the DCT algorithm. The canonical definition for it is
786 return cos ((k
*pi
)/(2*N
))
788 def unscaled_DCT (N
, input, output
):
789 for o
in range(N
): # o is output index
791 for i
in range(N
): # i is input index
792 output
[o
] = output
[o
] + input[i
] * C ((2*i
+1)*o
, N
)
794 # This trivial algorithm uses N*N multiplications and N*(N-1) additions.
797 # One possible decomposition on this calculus is to use the fact that C (i, N)
798 # and C (2*N-i, N) are opposed. This can lead to this decomposition :
800 #def unscaled_DCT (N, input, output):
801 # even_input = vector (N)
802 # odd_input = vector (N)
803 # even_output = vector (N)
804 # odd_output = vector (N)
806 # for i in range(N/2):
807 # even_input[i] = input[i] + input[N-1-i]
808 # odd_input[i] = input[i] - input[N-1-i]
810 # unscaled_DCT (N, even_input, even_output)
811 # unscaled_DCT (N, odd_input, odd_output)
813 # for i in range(N/2):
814 # output[2*i] = even_output[2*i]
815 # output[2*i+1] = odd_output[2*i+1]
817 # Now the even part can easily be calculated : by looking at the C(k,N)
818 # formula, we see that the even part is actually an unscaled DCT of size N/2.
819 # The odd part looks like a DCT of size N/2, but the coefficients are
820 # actually C ((2*i+1)*(2*o+1), 2*N) instead of C ((2*i+1)*o, N).
822 # We use a trigonometric relation here :
823 # 2 * C ((a+b)/2, N) * C ((a-b)/2, N) = C (a, N) + C (b, N)
824 # Thus with a = (2*i+1)*o and b = (2*i+1)*(o+1) :
825 # 2 * C((2*i+1)*(2*o+1),2N) * C(2*i+1,2N) = C((2*i+1)*o,N) + C((2*i+1)*(o+1),N)
827 # This leads us to the Lee DCT algorithm :
829 def unscaled_DCT_Lee (N
, input, output
):
830 even_input
= vector(N
/2)
831 odd_input
= vector(N
/2)
832 even_output
= vector(N
/2)
833 odd_output
= vector(N
/2)
836 even_input
[i
] = input[i
] + input[N
-1-i
]
837 odd_input
[i
] = input[i
] - input[N
-1-i
]
840 odd_input
[i
] = odd_input
[i
] * (0.5 / C (2*i
+1, N
))
842 unscaled_DCT (N
/2, even_input
, even_output
)
843 unscaled_DCT (N
/2, odd_input
, odd_output
)
845 for i
in range(N
/2-1):
846 odd_output
[i
] = odd_output
[i
] + odd_output
[i
+1]
849 output
[2*i
] = even_output
[i
]
850 output
[2*i
+1] = odd_output
[i
];
852 # Notes about this algorithm :
854 # The algorithm can be easily inverted to calculate the IDCT instead :
855 # each of the basic stages are separately inversible...
857 # This function does N adds, then N/2 muls, then 2 recursive calls with
858 # size N/2, then N/2-1 adds again. If we apply it recursively, the total
859 # number of operations will be N*log2(N)/2 multiplies and N*(3*log2(N)/2-1) + 1
860 # additions. So this is much faster than the canonical algorithm.
862 # Some of the multiplication coefficients 0.5/cos(...) can get quite large.
863 # This means that a small error in the input will give a large error on the
864 # output... For a DCT of size N the biggest coefficient will be at i=N/2-1
865 # and it will be slightly more than N/pi which can be large for large N's.
867 # In the IDCT however, the multiplication coefficients for the reverse
868 # transformation are of the form 2*cos(...) so they can not get big and there
869 # is no accuracy problem.
871 # You can find another description of this algorithm at
872 # http://www.intel.com/drg/mmx/appnotes/ap533.htm
876 # Another idea is to observe that the DCT calculation can be made to look like
877 # the DFT calculation : C (k, N) is the real part of W (k, 4*N) or W (-k, 4*N).
878 # We can use this idea translate the DCT algorithm into a call to the DFT
881 def unscaled_DCT_DFT (N
, input, output
):
882 DFT_input
= vector (4*N
)
883 DFT_output
= vector (4*N
)
886 DFT_input
[2*i
+1] = input[i
]
887 #DFT_input[4*N-2*i-1] = input[i] # We could use this instead
889 unscaled_DFT (4*N
, DFT_input
, DFT_output
)
892 output
[i
] = DFT_output
[i
].real
895 # We can then use our knowledge of the DFT calculation to optimize for this
896 # particular case. For example using the radix-2 decimation-in-time method :
898 #def unscaled_DCT_DFT (N, input, output):
899 # DFT_input = vector (2*N)
900 # DFT_output = vector (2*N)
903 # DFT_input[i] = input[i]
904 # #DFT_input[2*N-1-i] = input[i] # We could use this instead
906 # unscaled_DFT (2*N, DFT_input, DFT_output)
909 # DFT_output[i] = DFT_output[i] * W (i, 4*N)
912 # output[i] = DFT_output[i].real
914 # This leads us to the AAN implementation of the DCT algorithm : if we set
915 # both DFT_input[i] and DFT_input[2*N-1-i] to input[i], then the imaginary
916 # parts of W(2*i+1) and W(-2*i-1) will compensate, and output_DFT[i] will
917 # already be a real after the multiplication by W(i,4*N). Which means that
918 # before the multiplication, it is the product of a real number and W(-i,4*N).
919 # This leads to the following code, called the AAN algorithm :
921 def unscaled_DCT_AAN (N
, input, output
):
922 DFT_input
= vector (2*N
)
923 DFT_output
= vector (2*N
)
926 DFT_input
[i
] = input[i
]
927 DFT_input
[2*N
-1-i
] = input[i
]
929 symetrical_unscaled_DFT (2*N
, DFT_input
, DFT_output
)
932 output
[i
] = DFT_output
[i
].real
* (0.5 / C (i
, N
))
934 # Notes about the AAN algorithm :
936 # The cost of this function is N real multiplies and a DFT of size 2*N. The
937 # DFT to calculate has special properties : the inputs are real and symmetric.
938 # Also, we only need to calculate the real parts of the N first DFT outputs.
939 # We can try to take advantage of all that.
941 # We can invert this algorithm to calculate the IDCT. The final multiply
942 # stage is trivially invertible. The DFT stage is invertible too, but we have
943 # to take into account the special properties of this particular DFT for that.
945 # Once again we have to take care of numerical precision for the DFT : the
946 # output coefficients can get large, so that a small error in the input will
947 # give a large error on the output... For a DCT of size N the biggest
948 # coefficient will be at i=N/2-1 and it will be slightly more than N/pi
950 # You can find another description of this algorithm at this url :
951 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fastdct.html
952 # (It is the same server where we already found a description of the fast DFT)
955 # To optimize the DFT calculation, we can take a lot of specific things into
956 # account : the input is real and symetric, and we only care about the real
957 # part of the output. Also, we only care about the N first output coefficients,
958 # but that one does not save operations actually, because the other
959 # coefficients are the conjugates of the ones we look anyway.
961 # One useful way to use the symmetry of the input is to use the radix-2
962 # decimation-in-frequency algorithm. We can write a version of
963 # unscaled_DFT_radix2_freq for the case where the input is symmetrical :
964 # we have removed a few additions in the first stages because even_input
965 # is symmetrical and odd_input is antisymetrical. Also, we have modified the
966 # odd_input vector so that the second half of it is set to zero and the real
967 # part of the DFT output is not modified. After that modification, the second
968 # part of the odd_input was null so we used the radix-2 decimation-in-frequency
969 # again on the odd DFT. Also odd_output is symmetrical because input is real...
971 def symetrical_unscaled_DFT (N
, input, output
):
972 even_input
= vector(N
/2)
973 odd_tmp
= vector(N
/2)
974 odd_input
= vector(N
/2)
975 even_output
= vector(N
/2)
976 odd_output
= vector(N
/2)
979 even_input
[N
/2-i
-1] = even_input
[i
] = input[i
] + input[N
/2-1-i
]
982 odd_tmp
[i
] = input[i
] - input[N
/2-1-i
]
984 odd_input
[0] = odd_tmp
[0]
985 for i
in range(N
/4)[1:]:
986 odd_input
[i
] = (odd_tmp
[i
] + odd_tmp
[i
-1]) * W (i
, N
)
988 unscaled_DFT (N
/2, even_input
, even_output
)
989 # symmetrical real inputs, real outputs
991 unscaled_DFT (N
/4, odd_input
, odd_output
)
992 # complex inputs, real outputs
995 output
[2*i
] = even_output
[i
]
998 output
[N
-1-4*i
] = output
[4*i
+1] = odd_output
[i
]
1000 # This procedure takes 3*N/4-1 real additions and N/2-3 real multiplies,
1001 # followed by another symmetrical real DFT of size N/2 and a "complex to real"
1004 # We thus get the following performance results :
1005 # N real additions real multiplies complex multiplies
1016 # 1024 5251 1158 294
1017 # 2048 11557 2349 750
1018 # 4096 25200 4734 1832
1019 # 8192 54544 9509 4336
1020 # 16384 117337 19062 10026
1021 # 32768 251127 38173 22770
1022 # 65536 535102 76398 50988
1025 # We thus get a better performance with the AAN DCT algorithm than with the
1026 # Lee DCT algorithm : we can do a DCT of size 32 with 189 additions, 54+32 real
1027 # multiplies, and 2 complex multiplies. The Lee algorithm would have used 209
1028 # additions and 80 multiplies. With the AAN algorithm, we also have the
1029 # advantage that a big number of the multiplies are actually grouped at the
1030 # output stage of the algorithm, so if we want to do a DCT followed by a
1031 # quantization stage, we will be able to group the multiply of the output with
1032 # the multiply of the quantization stage, thus saving 32 more operations. In
1033 # the mpeg audio layer 1 or 2 processing, we can also group the multiply of the
1034 # output with the multiply of the convolution stage...
1036 # Another source code for the AAN algorithm (implemented on 8 points, and
1037 # without all of the explanations) can be found at this URL :
1038 # http://developer.intel.com/drg/pentiumII/appnotes/aan_org.c . This
1039 # implementation uses 28 adds and 6+8 muls instead of 29 adds and 5+8 muls -
1040 # the difference is that in the symetrical_unscaled_DFT procedure, they noticed
1041 # how odd_input[i] and odd_input[N/4-i] will be combined at the start of the
1042 # complex-to-real DFT and they took advantage of this to convert 2 real adds
1043 # and 4 real muls into one complex multiply.
1046 # TODO : write about multi-dimentional DCT
1053 for i
in range(len(vector
)):
1056 vector
[i
] = vector
[i
] + 0j
1057 realstr
= "%+.4f" % vector
[i
].real
1058 imagstr
= "%+.4fj" % vector
[i
].imag
1059 if (realstr
== "-0.0000"):
1061 if (imagstr
== "-0.0000j"):
1062 imagstr
= "+0.0000j"
1063 str = str + realstr
#+ imagstr
1072 input[i
] = random() + 1j
* random()
1074 unscaled_DFT (N
, input, output
)
1075 unscaled_DFT (N
, input, verify
)
1077 if (dump(output
) != dump(verify
)):
1084 # PERFORMANCE ANALYSIS CODE
1086 def display (table
):
1088 print "#\tN\treal additions\treal multiplies\tcomplex multiplies"
1089 while table
.has_key(N
):
1090 print "#%8d%16d%16d%16d" % (N
, table
[N
][0], table
[N
][1], table
[N
][2])
1094 best_complex_DFT
= {}
1096 def complex_DFT (max_N
):
1097 best_complex_DFT
[1] = (0,0,0)
1098 best_complex_DFT
[2] = (4,0,0)
1099 best_complex_DFT
[4] = (16,0,0)
1102 # best method = split radix
1103 best2
= best_complex_DFT
[N
/2]
1104 best4
= best_complex_DFT
[N
/4]
1105 best_complex_DFT
[N
] = (best2
[0] + 2*best4
[0] + 3*N
+ 4,
1106 best2
[1] + 2*best4
[1] + 4,
1107 best2
[2] + 2*best4
[2] + N
/2 - 4)
1112 def real_DFT (max_N
):
1113 best_real_DFT
[1] = (0,0,0)
1114 best_real_DFT
[2] = (2,0,0)
1115 best_real_DFT
[4] = (6,0,0)
1118 # best method = split radix decimate-in-frequency
1119 best2
= best_real_DFT
[N
/2]
1120 best4
= best_complex_DFT
[N
/4]
1121 best_real_DFT
[N
] = (best2
[0] + best4
[0] + N
+ 2,
1122 best2
[1] + best4
[1] + 2,
1123 best2
[2] + best4
[2] + N
/4 - 2)
1126 best_complex2real_DFT
= {}
1128 def complex2real_DFT (max_N
):
1129 best_complex2real_DFT
[1] = (0,0,0)
1130 best_complex2real_DFT
[2] = (2,0,0)
1131 best_complex2real_DFT
[4] = (8,0,0)
1134 best2
= best_complex2real_DFT
[N
/2]
1135 best4
= best_complex_DFT
[N
/4]
1136 best_complex2real_DFT
[N
] = (best2
[0] + best4
[0] + 3*N
/2 + 1,
1137 best2
[1] + best4
[1] + 2,
1138 best2
[2] + best4
[2] + N
/4 - 2)
1141 best_real_symetric_DFT
= {}
1143 def real_symetric_DFT (max_N
):
1144 best_real_symetric_DFT
[1] = (0,0,0)
1145 best_real_symetric_DFT
[2] = (0,0,0)
1146 best_real_symetric_DFT
[4] = (2,0,0)
1149 best2
= best_real_symetric_DFT
[N
/2]
1150 best4
= best_complex2real_DFT
[N
/4]
1151 best_real_symetric_DFT
[N
] = (best2
[0] + best4
[0] + 3*N
/4 - 1,
1152 best2
[1] + best4
[1] + N
/2 - 3,
1153 best2
[2] + best4
[2])
1158 complex2real_DFT (65536)
1159 real_symetric_DFT (65536)
1163 display (best_complex_DFT
)
1166 display (best_real_DFT
)
1168 print "complex2real DFT"
1169 display (best_complex2real_DFT
)
1171 print "real symetric DFT"
1172 display (best_real_symetric_DFT
)