* modules/codec/ffmpeg/encoder.c: compilation fix for ffmpeg 0.4.8 (not sure it's...
[vlc.git] / doc / transforms.py
blob531fdd358e727284860a0f697e188daa5f72126e
1 # Lossy compression algorithms very often make use of DCT or DFT calculations,
2 # or variations of these calculations. This file is intended to be a short
3 # reference about classical DCT and DFT algorithms.
6 import math
7 import cmath
9 pi = math.pi
10 sin = math.sin
11 cos = math.cos
12 sqrt = math.sqrt
14 def exp_j (alpha):
15 return cmath.exp (alpha * 1j)
17 def conjugate (c):
18 c = c + 0j
19 return c.real - 1j * c.imag
21 def vector (N):
22 return [0j] * N
25 # Let us start withthe canonical definition of the unscaled DFT algorithm :
26 # (I can not draw sigmas in a text file so I'll use python code instead) :)
28 def W (k, N):
29 return exp_j ((-2*pi*k)/N)
31 def unscaled_DFT (N, input, output):
32 for o in range(N): # o is output index
33 output[o] = 0
34 for i in range(N):
35 output[o] = output[o] + input[i] * W (i*o, N)
37 # This algorithm takes complex input and output. There are N*N complex
38 # multiplications and N*(N-1) complex additions.
41 # Of course this algorithm is an extremely naive implementation and there are
42 # some ways to use the trigonometric properties of the coefficients to find
43 # some decompositions that can accelerate the calculation by several orders
44 # of magnitude... This is a well known and studied problem. One of the
45 # available explanations of this process is at this url :
46 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
49 # Let's start with the radix-2 decimation-in-time algorithm :
51 def unscaled_DFT_radix2_time (N, input, output):
52 even_input = vector(N/2)
53 odd_input = vector(N/2)
54 even_output = vector(N/2)
55 odd_output = vector(N/2)
57 for i in range(N/2):
58 even_input[i] = input[2*i]
59 odd_input[i] = input[2*i+1]
61 unscaled_DFT (N/2, even_input, even_output)
62 unscaled_DFT (N/2, odd_input, odd_output)
64 for i in range(N/2):
65 odd_output[i] = odd_output[i] * W (i, N)
67 for i in range(N/2):
68 output[i] = even_output[i] + odd_output[i]
69 output[i+N/2] = even_output[i] - odd_output[i]
71 # This algorithm takes complex input and output.
73 # We divide the DFT calculation into 2 DFT calculations of size N/2
74 # We then do N/2 complex multiplies followed by N complex additions.
75 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
76 # multiplies... we will skip 1 for i=0 and 1 for i=N/4. Also for i=N/8 and for
77 # i=3*N/8 the W(i,N) values can be special-cased to implement the complex
78 # multiplication using only 2 real additions and 2 real multiplies)
80 # Also note that all the basic stages of this DFT algorithm are easily
81 # reversible, so we can calculate the IDFT with the same complexity.
84 # A varient of this is the radix-2 decimation-in-frequency algorithm :
86 def unscaled_DFT_radix2_freq (N, input, output):
87 even_input = vector(N/2)
88 odd_input = vector(N/2)
89 even_output = vector(N/2)
90 odd_output = vector(N/2)
92 for i in range(N/2):
93 even_input[i] = input[i] + input[i+N/2]
94 odd_input[i] = input[i] - input[i+N/2]
96 for i in range(N/2):
97 odd_input[i] = odd_input[i] * W (i, N)
99 unscaled_DFT (N/2, even_input, even_output)
100 unscaled_DFT (N/2, odd_input, odd_output)
102 for i in range(N/2):
103 output[2*i] = even_output[i]
104 output[2*i+1] = odd_output[i]
106 # Note that the decimation-in-time and the decimation-in-frequency varients
107 # have exactly the same complexity, they only do the operations in a different
108 # order.
110 # Actually, if you look at the decimation-in-time varient of the DFT, and
111 # reverse it to calculate an IDFT, you get something that is extremely close
112 # to the decimation-in-frequency DFT algorithm...
115 # The radix-4 algorithms are slightly more efficient : they take into account
116 # the fact that with complex numbers, multiplications by j and -j are also
117 # "free"... i.e. when you code them using real numbers, they translate into
118 # a few data moves but no real operation.
120 # Let's start with the radix-4 decimation-in-time algorithm :
122 def unscaled_DFT_radix4_time (N, input, output):
123 input_0 = vector(N/4)
124 input_1 = vector(N/4)
125 input_2 = vector(N/4)
126 input_3 = vector(N/4)
127 output_0 = vector(N/4)
128 output_1 = vector(N/4)
129 output_2 = vector(N/4)
130 output_3 = vector(N/4)
131 tmp_0 = vector(N/4)
132 tmp_1 = vector(N/4)
133 tmp_2 = vector(N/4)
134 tmp_3 = vector(N/4)
136 for i in range(N/4):
137 input_0[i] = input[4*i]
138 input_1[i] = input[4*i+1]
139 input_2[i] = input[4*i+2]
140 input_3[i] = input[4*i+3]
142 unscaled_DFT (N/4, input_0, output_0)
143 unscaled_DFT (N/4, input_1, output_1)
144 unscaled_DFT (N/4, input_2, output_2)
145 unscaled_DFT (N/4, input_3, output_3)
147 for i in range(N/4):
148 output_1[i] = output_1[i] * W (i, N)
149 output_2[i] = output_2[i] * W (2*i, N)
150 output_3[i] = output_3[i] * W (3*i, N)
152 for i in range(N/4):
153 tmp_0[i] = output_0[i] + output_2[i]
154 tmp_1[i] = output_0[i] - output_2[i]
155 tmp_2[i] = output_1[i] + output_3[i]
156 tmp_3[i] = output_1[i] - output_3[i]
158 for i in range(N/4):
159 output[i] = tmp_0[i] + tmp_2[i]
160 output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
161 output[i+N/2] = tmp_0[i] - tmp_2[i]
162 output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
164 # This algorithm takes complex input and output.
166 # We divide the DFT calculation into 4 DFT calculations of size N/4
167 # We then do 3*N/4 complex multiplies followed by 2*N complex additions.
168 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
169 # multiplies... we will skip 3 for i=0 and 1 for i=N/8. Also for i=N/8
170 # the remaining W(i,N) and W(3*i,N) multiplies can be implemented using only
171 # 2 real additions and 2 real multiplies. For i=N/16 and i=3*N/16 we can also
172 # optimise the W(2*i/N) multiply this way.
174 # If we wanted to do the same decomposition with one radix-2 decomposition
175 # of size N and 2 radix-2 decompositions of size N/2, the total cost would be
176 # N complex multiplies and 2*N complex additions. Thus we see that the
177 # decomposition of one DFT calculation of size N into 4 calculations of size
178 # N/4 using the radix-4 algorithm instead of the radix-2 algorithm saved N/4
179 # complex multiplies...
182 # The radix-4 decimation-in-frequency algorithm is similar :
184 def unscaled_DFT_radix4_freq (N, input, output):
185 input_0 = vector(N/4)
186 input_1 = vector(N/4)
187 input_2 = vector(N/4)
188 input_3 = vector(N/4)
189 output_0 = vector(N/4)
190 output_1 = vector(N/4)
191 output_2 = vector(N/4)
192 output_3 = vector(N/4)
193 tmp_0 = vector(N/4)
194 tmp_1 = vector(N/4)
195 tmp_2 = vector(N/4)
196 tmp_3 = vector(N/4)
198 for i in range(N/4):
199 tmp_0[i] = input[i] + input[i+N/2]
200 tmp_1[i] = input[i+N/4] + input[i+3*N/4]
201 tmp_2[i] = input[i] - input[i+N/2]
202 tmp_3[i] = input[i+N/4] - input[i+3*N/4]
204 for i in range(N/4):
205 input_0[i] = tmp_0[i] + tmp_1[i]
206 input_1[i] = tmp_2[i] - 1j * tmp_3[i]
207 input_2[i] = tmp_0[i] - tmp_1[i]
208 input_3[i] = tmp_2[i] + 1j * tmp_3[i]
210 for i in range(N/4):
211 input_1[i] = input_1[i] * W (i, N)
212 input_2[i] = input_2[i] * W (2*i, N)
213 input_3[i] = input_3[i] * W (3*i, N)
215 unscaled_DFT (N/4, input_0, output_0)
216 unscaled_DFT (N/4, input_1, output_1)
217 unscaled_DFT (N/4, input_2, output_2)
218 unscaled_DFT (N/4, input_3, output_3)
220 for i in range(N/4):
221 output[4*i] = output_0[i]
222 output[4*i+1] = output_1[i]
223 output[4*i+2] = output_2[i]
224 output[4*i+3] = output_3[i]
226 # Once again the complexity is exactly the same as for the radix-4
227 # decimation-in-time DFT algorithm, only the order of the operations is
228 # different.
231 # Now let us reorder the radix-4 algorithms in a different way :
233 #def unscaled_DFT_radix4_time (N, input, output):
234 # input_0 = vector(N/4)
235 # input_1 = vector(N/4)
236 # input_2 = vector(N/4)
237 # input_3 = vector(N/4)
238 # output_0 = vector(N/4)
239 # output_1 = vector(N/4)
240 # output_2 = vector(N/4)
241 # output_3 = vector(N/4)
242 # tmp_0 = vector(N/4)
243 # tmp_1 = vector(N/4)
244 # tmp_2 = vector(N/4)
245 # tmp_3 = vector(N/4)
247 # for i in range(N/4):
248 # input_0[i] = input[4*i]
249 # input_2[i] = input[4*i+2]
251 # unscaled_DFT (N/4, input_0, output_0)
252 # unscaled_DFT (N/4, input_2, output_2)
254 # for i in range(N/4):
255 # output_2[i] = output_2[i] * W (2*i, N)
257 # for i in range(N/4):
258 # tmp_0[i] = output_0[i] + output_2[i]
259 # tmp_1[i] = output_0[i] - output_2[i]
261 # for i in range(N/4):
262 # input_1[i] = input[4*i+1]
263 # input_3[i] = input[4*i+3]
265 # unscaled_DFT (N/4, input_1, output_1)
266 # unscaled_DFT (N/4, input_3, output_3)
268 # for i in range(N/4):
269 # output_1[i] = output_1[i] * W (i, N)
270 # output_3[i] = output_3[i] * W (3*i, N)
272 # for i in range(N/4):
273 # tmp_2[i] = output_1[i] + output_3[i]
274 # tmp_3[i] = output_1[i] - output_3[i]
276 # for i in range(N/4):
277 # output[i] = tmp_0[i] + tmp_2[i]
278 # output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
279 # output[i+N/2] = tmp_0[i] - tmp_2[i]
280 # output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
282 # We didnt do anything here, only reorder the operations. But now, look at the
283 # first part of this function, up to the calculations of tmp0 and tmp1 : this
284 # is extremely similar to the radix-2 decimation-in-time algorithm ! or more
285 # precisely, it IS the radix-2 decimation-in-time algorithm, with size N/2,
286 # applied on a vector representing the even input coefficients, and giving
287 # an output vector that is the concatenation of tmp0 and tmp1.
288 # This is important to notice, because this means we can now choose to
289 # calculate tmp0 and tmp1 using any DFT algorithm that we want, and we know
290 # that some of them are more efficient than radix-2...
292 # This leads us directly to the split-radix decimation-in-time algorithm :
294 def unscaled_DFT_split_radix_time (N, input, output):
295 even_input = vector(N/2)
296 input_1 = vector(N/4)
297 input_3 = vector(N/4)
298 even_output = vector(N/2)
299 output_1 = vector(N/4)
300 output_3 = vector(N/4)
301 tmp_0 = vector(N/4)
302 tmp_1 = vector(N/4)
304 for i in range(N/2):
305 even_input[i] = input[2*i]
307 for i in range(N/4):
308 input_1[i] = input[4*i+1]
309 input_3[i] = input[4*i+3]
311 unscaled_DFT (N/2, even_input, even_output)
312 unscaled_DFT (N/4, input_1, output_1)
313 unscaled_DFT (N/4, input_3, output_3)
315 for i in range(N/4):
316 output_1[i] = output_1[i] * W (i, N)
317 output_3[i] = output_3[i] * W (3*i, N)
319 for i in range(N/4):
320 tmp_0[i] = output_1[i] + output_3[i]
321 tmp_1[i] = output_1[i] - output_3[i]
323 for i in range(N/4):
324 output[i] = even_output[i] + tmp_0[i]
325 output[i+N/4] = even_output[i+N/4] - 1j * tmp_1[i]
326 output[i+N/2] = even_output[i] - tmp_0[i]
327 output[i+3*N/4] = even_output[i+N/4] + 1j * tmp_1[i]
329 # This function performs one DFT of size N/2 and two of size N/4, followed by
330 # N/2 complex multiplies and 3*N/2 complex additions.
331 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
332 # multiplies... we will skip 2 for i=0. Also for i=N/8 the W(i,N) and W(3*i,N)
333 # multiplies can be implemented using only 2 real additions and 2 real
334 # multiplies)
337 # We can similarly define the split-radix decimation-in-frequency DFT :
339 def unscaled_DFT_split_radix_freq (N, input, output):
340 even_input = vector(N/2)
341 input_1 = vector(N/4)
342 input_3 = vector(N/4)
343 even_output = vector(N/2)
344 output_1 = vector(N/4)
345 output_3 = vector(N/4)
346 tmp_0 = vector(N/4)
347 tmp_1 = vector(N/4)
349 for i in range(N/2):
350 even_input[i] = input[i] + input[i+N/2]
352 for i in range(N/4):
353 tmp_0[i] = input[i] - input[i+N/2]
354 tmp_1[i] = input[i+N/4] - input[i+3*N/4]
356 for i in range(N/4):
357 input_1[i] = tmp_0[i] - 1j * tmp_1[i]
358 input_3[i] = tmp_0[i] + 1j * tmp_1[i]
360 for i in range(N/4):
361 input_1[i] = input_1[i] * W (i, N)
362 input_3[i] = input_3[i] * W (3*i, N)
364 unscaled_DFT (N/2, even_input, even_output)
365 unscaled_DFT (N/4, input_1, output_1)
366 unscaled_DFT (N/4, input_3, output_3)
368 for i in range(N/2):
369 output[2*i] = even_output[i]
371 for i in range(N/4):
372 output[4*i+1] = output_1[i]
373 output[4*i+3] = output_3[i]
375 # The complexity is again the same as for the decimation-in-time varient.
378 # Now let us now summarize our various algorithms for DFT decomposition :
380 # radix-2 : DFT(N) -> 2*DFT(N/2) using N/2 multiplies and N additions
381 # radix-4 : DFT(N) -> 4*DFT(N/2) using 3*N/4 multiplies and 2*N additions
382 # split-radix : DFT(N) -> DFT(N/2) + 2*DFT(N/4) using N/2 muls and 3*N/2 adds
384 # (we are always speaking of complex multiplies and complex additions... a
385 # complex addition is implemented with 2 real additions, and a complex
386 # multiply is implemented with either 2 adds and 4 muls or 3 adds and 3 muls,
387 # so we will keep a separate count of these)
389 # If we want to take into account the special values of W(i,N), we can remove
390 # a few complex multiplies. Supposing N>=16 we can remove :
391 # radix-2 : remove 2 complex multiplies, simplify 2 others
392 # radix-4 : remove 4 complex multiplies, simplify 4 others
393 # split-radix : remove 2 complex multiplies, simplify 2 others
395 # This gives the following table for the complexity of a complex DFT :
396 # N real additions real multiplies complex multiplies
397 # 1 0 0 0
398 # 2 4 0 0
399 # 4 16 0 0
400 # 8 52 4 0
401 # 16 136 8 4
402 # 32 340 20 16
403 # 64 808 40 52
404 # 128 1876 84 144
405 # 256 4264 168 372
406 # 512 9556 340 912
407 # 1024 21160 680 2164
408 # 2048 46420 1364 5008
409 # 4096 101032 2728 11380
410 # 8192 218452 5460 25488
411 # 16384 469672 10920 56436
412 # 32768 1004884 21844 123792
413 # 65536 2140840 43688 269428
415 # If we chose to implement complex multiplies with 3 real muls + 3 real adds,
416 # then these results are consistent with the table at the end of the
417 # www.cmlab.csie.ntu.edu.tw DFT tutorial that I mentionned earlier.
420 # Now another important case for the DFT is the one where the inputs are
421 # real numbers instead of complex ones. We have to find ways to optimize for
422 # this important case.
424 # If the DFT inputs are real-valued, then the DFT outputs have nice properties
425 # too : output[0] and output[N/2] will be real numbers, and output[N-i] will
426 # be the conjugate of output[i] for i in 0...N/2-1
428 # Likewise, if the DFT inputs are purely imaginary numbers, then the DFT
429 # outputs will have special properties too : output[0] and output[N/2] will be
430 # purely imaginary, and output[N-i] will be the opposite of the conjugate of
431 # output[i] for i in 0...N/2-1
433 # We can use these properties to calculate two real-valued DFT at once :
435 def two_real_unscaled_DFT (N, input1, input2, output1, output2):
436 input = vector(N)
437 output = vector(N)
439 for i in range(N):
440 input[i] = input1[i] + 1j * input2[i]
442 unscaled_DFT (N, input, output)
444 output1[0] = output[0].real + 0j
445 output2[0] = output[0].imag + 0j
447 for i in range(N/2)[1:]:
448 output1[i] = 0.5 * (output[i] + conjugate(output[N-i]))
449 output2[i] = -0.5j * (output[i] - conjugate(output[N-i]))
451 output1[N-i] = conjugate(output1[i])
452 output2[N-i] = conjugate(output2[i])
454 output1[N/2] = output[N/2].real + 0j
455 output2[N/2] = output[N/2].imag + 0j
457 # This routine does a total of N-2 complex additions and N-2 complex
458 # multiplies by 0.5
460 # This routine can also be inverted to calculate the IDFT of two vectors at
461 # once if we know that the outputs will be real-valued.
464 # If we have only one real-valued DFT calculation to do, we can still cut this
465 # calculation in several parts using one of the decimate-in-time methods
466 # (so that the different parts are still real-valued)
468 # As with complex DFT calculations, the best method is to use a split radix.
469 # There are a lot of symetries in the DFT outputs that we can exploit to
470 # reduce the number of operations...
472 def real_unscaled_DFT_split_radix_time_1 (N, input, output):
473 even_input = vector(N/2)
474 even_output = vector(N/2)
475 input_1 = vector(N/4)
476 output_1 = vector(N/4)
477 input_3 = vector(N/4)
478 output_3 = vector(N/4)
479 tmp_0 = vector(N/4)
480 tmp_1 = vector(N/4)
482 for i in range(N/2):
483 even_input[i] = input[2*i]
485 for i in range(N/4):
486 input_1[i] = input[4*i+1]
487 input_3[i] = input[4*i+3]
489 unscaled_DFT (N/2, even_input, even_output)
490 # this is again a real DFT !
491 # we will only use even_output[i] for i in 0 ... N/4 included. we know that
492 # even_output[N/2-i] is the conjugate of even_output[i]... also we know
493 # that even_output[0] and even_output[N/4] are purely real.
495 unscaled_DFT (N/4, input_1, output_1)
496 unscaled_DFT (N/4, input_3, output_3)
497 # these are real DFTs too... with symetries in the outputs... once again
499 tmp_0[0] = output_1[0] + output_3[0] # real numbers
500 tmp_1[0] = output_1[0] - output_3[0] # real numbers
502 tmp__0 = (output_1[N/8] + output_3[N/8]) * sqrt(0.5) # real numbers
503 tmp__1 = (output_1[N/8] - output_3[N/8]) * sqrt(0.5) # real numbers
504 tmp_0[N/8] = tmp__1 - 1j * tmp__0 # real + 1j * real
505 tmp_1[N/8] = tmp__0 - 1j * tmp__1 # real + 1j * real
507 for i in range(N/8)[1:]:
508 output_1[i] = output_1[i] * W (i, N)
509 output_3[i] = output_3[i] * W (3*i, N)
511 tmp_0[i] = output_1[i] + output_3[i]
512 tmp_1[i] = output_1[i] - output_3[i]
514 tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
515 tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
517 output[0] = even_output[0] + tmp_0[0] # real numbers
518 output[N/4] = even_output[N/4] - 1j * tmp_1[0] # real + 1j * real
519 output[N/2] = even_output[0] - tmp_0[0] # real numbers
520 output[3*N/4] = even_output[N/4] + 1j * tmp_1[0] # real + 1j * real
522 for i in range(N/4)[1:]:
523 output[i] = even_output[i] + tmp_0[i]
524 output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
526 output[N-i] = conjugate(output[i])
527 output[3*N/4-i] = conjugate(output[i+N/4])
529 # This function performs one real DFT of size N/2 and two real DFT of size
530 # N/4, followed by 6 real additions, 2 real multiplies, 3*N/4-4 complex
531 # additions and N/4-2 complex multiplies.
534 # We can also try to combine the two real DFT of size N/4 into a single complex
535 # DFT :
537 def real_unscaled_DFT_split_radix_time_2 (N, input, output):
538 even_input = vector(N/2)
539 even_output = vector(N/2)
540 odd_input = vector(N/4)
541 odd_output = vector(N/4)
542 tmp_0 = vector(N/4)
543 tmp_1 = vector(N/4)
545 for i in range(N/2):
546 even_input[i] = input[2*i]
548 for i in range(N/4):
549 odd_input[i] = input[4*i+1] + 1j * input[4*i+3]
551 unscaled_DFT (N/2, even_input, even_output)
552 # this is again a real DFT !
553 # we will only use even_output[i] for i in 0 ... N/4 included. we know that
554 # even_output[N/2-i] is the conjugate of even_output[i]... also we know
555 # that even_output[0] and even_output[N/4] are purely real.
557 unscaled_DFT (N/4, odd_input, odd_output)
558 # but this one is a complex DFT so no special properties here
560 output_1 = odd_output[0].real
561 output_3 = odd_output[0].imag
562 tmp_0[0] = output_1 + output_3 # real numbers
563 tmp_1[0] = output_1 - output_3 # real numbers
565 output_1 = odd_output[N/8].real
566 output_3 = odd_output[N/8].imag
567 tmp__0 = (output_1 + output_3) * sqrt(0.5) # real numbers
568 tmp__1 = (output_1 - output_3) * sqrt(0.5) # real numbers
569 tmp_0[N/8] = tmp__1 - 1j * tmp__0 # real + 1j * real
570 tmp_1[N/8] = tmp__0 - 1j * tmp__1 # real + 1j * real
572 for i in range(N/8)[1:]:
573 output_1 = odd_output[i] + conjugate(odd_output[N/4-i])
574 output_3 = odd_output[i] - conjugate(odd_output[N/4-i])
576 output_1 = output_1 * 0.5 * W (i, N)
577 output_3 = output_3 * -0.5j * W (3*i, N)
579 tmp_0[i] = output_1 + output_3
580 tmp_1[i] = output_1 - output_3
582 tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
583 tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
585 output[0] = even_output[0] + tmp_0[0] # real numbers
586 output[N/4] = even_output[N/4] - 1j * tmp_1[0] # real + 1j * real
587 output[N/2] = even_output[0] - tmp_0[0] # real numbers
588 output[3*N/4] = even_output[N/4] + 1j * tmp_1[0] # real + 1j * real
590 for i in range(N/4)[1:]:
591 output[i] = even_output[i] + tmp_0[i]
592 output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
594 output[N-i] = conjugate(output[i])
595 output[3*N/4-i] = conjugate(output[i+N/4])
597 # This function performs one real DFT of size N/2 and one complex DFT of size
598 # N/4, followed by 6 real additions, 2 real multiplies, N-6 complex additions
599 # and N/4-2 complex multiplies.
601 # After comparing the performance, it turns out that for real-valued DFT, the
602 # version of the algorithm that subdivides the calculation into one real
603 # DFT of size N/2 and two real DFT of size N/4 is the most efficient one.
604 # The other version gives exactly the same number of multiplies and a few more
605 # real additions.
608 # Now we can also try the decimate-in-frequency method for a real-valued DFT.
609 # Using the split-radix algorithm, and by taking into account the symetries of
610 # the outputs :
612 def real_unscaled_DFT_split_radix_freq (N, input, output):
613 even_input = vector(N/2)
614 input_1 = vector(N/4)
615 even_output = vector(N/2)
616 output_1 = vector(N/4)
617 tmp_0 = vector(N/4)
618 tmp_1 = vector(N/4)
620 for i in range(N/2):
621 even_input[i] = input[i] + input[i+N/2]
623 for i in range(N/4):
624 tmp_0[i] = input[i] - input[i+N/2]
625 tmp_1[i] = input[i+N/4] - input[i+3*N/4]
627 for i in range(N/4):
628 input_1[i] = tmp_0[i] - 1j * tmp_1[i]
630 for i in range(N/4):
631 input_1[i] = input_1[i] * W (i, N)
633 unscaled_DFT (N/2, even_input, even_output)
634 # This is still a real-valued DFT
636 unscaled_DFT (N/4, input_1, output_1)
637 # But that one is a complex-valued DFT
639 for i in range(N/2):
640 output[2*i] = even_output[i]
642 for i in range(N/4):
643 output[4*i+1] = output_1[i]
644 output[N-1-4*i] = conjugate(output_1[i])
646 # I think this implementation is much more elegant than the decimate-in-time
647 # version ! It looks very much like the complex-valued version, all we had to
648 # do was remove one of the complex-valued internal DFT calls because we could
649 # deduce the outputs by using the symetries of the problem.
651 # As for performance, we did N real additions, N/4 complex multiplies (a bit
652 # less actually, because W(0,N) = 1 and W(N/8,N) is a "simple" multiply), then
653 # one real DFT of size N/2 and one complex DFT of size N/4.
655 # It turns out that even if the methods are so different, the number of
656 # operations is exactly the same as for the best of the two decimation-in-time
657 # methods that we tried.
660 # This gives us the following performance for real-valued DFT :
661 # N real additions real multiplies complex multiplies
662 # 2 2 0 0
663 # 4 6 0 0
664 # 8 20 2 0
665 # 16 54 4 2
666 # 32 140 10 8
667 # 64 342 20 26
668 # 128 812 42 72
669 # 256 1878 84 186
670 # 512 4268 170 456
671 # 1024 9558 340 1082
672 # 2048 21164 682 2504
673 # 4096 46422 1364 5690
674 # 8192 101036 2730 12744
675 # 16384 218454 5460 28218
676 # 32768 469676 10922 61896
677 # 65536 1004886 21844 134714
680 # As an example, this is an implementation of the real-valued DFT8 :
682 def DFT8 (input, output):
683 even_0 = input[0] + input[4]
684 even_1 = input[1] + input[5]
685 even_2 = input[2] + input[6]
686 even_3 = input[3] + input[7]
688 tmp_0 = even_0 + even_2
689 tmp_1 = even_0 - even_2
690 tmp_2 = even_1 + even_3
691 tmp_3 = even_1 - even_3
693 output[0] = tmp_0 + tmp_2
694 output[2] = tmp_1 - 1j * tmp_3
695 output[4] = tmp_0 - tmp_2
697 odd_0_r = input[0] - input[4]
698 odd_0_i = input[2] - input[6]
700 tmp_0 = input[1] - input[5]
701 tmp_1 = input[3] - input[7]
702 odd_1_r = (tmp_0 - tmp_1) * sqrt(0.5)
703 odd_1_i = (tmp_0 + tmp_1) * sqrt(0.5)
705 output[1] = (odd_0_r + odd_1_r) - 1j * (odd_0_i + odd_1_i)
706 output[5] = (odd_0_r - odd_1_r) - 1j * (odd_0_i - odd_1_i)
708 output[3] = conjugate(output[5])
709 output[6] = conjugate(output[2])
710 output[7] = conjugate(output[1])
713 # Also a basic implementation of the real-valued DFT4 :
715 def DFT4 (input, output):
716 tmp_0 = input[0] + input[2]
717 tmp_1 = input[0] - input[2]
718 tmp_2 = input[1] + input[3]
719 tmp_3 = input[1] - input[3]
721 output[0] = tmp_0 + tmp_2
722 output[1] = tmp_1 - 1j * tmp_3
723 output[2] = tmp_0 - tmp_2
724 output[3] = tmp_1 + 1j * tmp_3
727 # A similar idea might be used to calculate only the real part of the output
728 # of a complex DFT : we take an DFT algorithm for real inputs and complex
729 # outputs and we simply reverse it. The resulting algorithm will only work
730 # with inputs that satisfy the conjugaison rule (input[i] is the conjugate of
731 # input[N-i]) so we can do a first pass to modify the input so that it follows
732 # this rule. An example implementation is as follows (adapted from the
733 # unscaled_DFT_split_radix_time algorithm) :
735 def complex2real_unscaled_DFT_split_radix_time (N, input, output):
736 even_input = vector(N/2)
737 input_1 = vector(N/4)
738 even_output = vector(N/2)
739 output_1 = vector(N/4)
741 for i in range(N/2):
742 even_input[i] = input[2*i]
744 for i in range(N/4):
745 input_1[i] = input[4*i+1] + conjugate(input[N-1-4*i])
747 unscaled_DFT (N/2, even_input, even_output)
748 unscaled_DFT (N/4, input_1, output_1)
750 for i in range(N/4):
751 output_1[i] = output_1[i] * W (i, N)
753 for i in range(N/4):
754 output[i] = even_output[i] + output_1[i].real
755 output[i+N/4] = even_output[i+N/4] + output_1[i].imag
756 output[i+N/2] = even_output[i] - output_1[i].real
757 output[i+3*N/4] = even_output[i+N/4] - output_1[i].imag
759 # This algorithm does N/4 complex additions, N/4-1 complex multiplies
760 # (including one "simple" multiply for i=N/8), N real additions, one
761 # "complex-to-real" DFT of size N/2, and one complex DFT of size N/4.
762 # Also, in the complex DFT of size N/4, we do not care about the imaginary
763 # part of output_1[0], which in practice allows us to save one real addition.
765 # This gives us the following performance for complex DFT with real outputs :
766 # N real additions real multiplies complex multiplies
767 # 1 0 0 0
768 # 2 2 0 0
769 # 4 8 0 0
770 # 8 25 2 0
771 # 16 66 4 2
772 # 32 167 10 8
773 # 64 400 20 26
774 # 128 933 42 72
775 # 256 2126 84 186
776 # 512 4771 170 456
777 # 1024 10572 340 1082
778 # 2048 23201 682 2504
779 # 4096 50506 1364 5690
780 # 8192 109215 2730 12744
781 # 16384 234824 5460 28218
782 # 32768 502429 10922 61896
783 # 65536 1070406 21844 134714
786 # Now let's talk about the DCT algorithm. The canonical definition for it is
787 # as follows :
789 def C (k, N):
790 return cos ((k*pi)/(2*N))
792 def unscaled_DCT (N, input, output):
793 for o in range(N): # o is output index
794 output[o] = 0
795 for i in range(N): # i is input index
796 output[o] = output[o] + input[i] * C ((2*i+1)*o, N)
798 # This trivial algorithm uses N*N multiplications and N*(N-1) additions.
801 # One possible decomposition on this calculus is to use the fact that C (i, N)
802 # and C (2*N-i, N) are opposed. This can lead to this decomposition :
804 #def unscaled_DCT (N, input, output):
805 # even_input = vector (N)
806 # odd_input = vector (N)
807 # even_output = vector (N)
808 # odd_output = vector (N)
810 # for i in range(N/2):
811 # even_input[i] = input[i] + input[N-1-i]
812 # odd_input[i] = input[i] - input[N-1-i]
814 # unscaled_DCT (N, even_input, even_output)
815 # unscaled_DCT (N, odd_input, odd_output)
817 # for i in range(N/2):
818 # output[2*i] = even_output[2*i]
819 # output[2*i+1] = odd_output[2*i+1]
821 # Now the even part can easily be calculated : by looking at the C(k,N)
822 # formula, we see that the even part is actually an unscaled DCT of size N/2.
823 # The odd part looks like a DCT of size N/2, but the coefficients are
824 # actually C ((2*i+1)*(2*o+1), 2*N) instead of C ((2*i+1)*o, N).
826 # We use a trigonometric relation here :
827 # 2 * C ((a+b)/2, N) * C ((a-b)/2, N) = C (a, N) + C (b, N)
828 # Thus with a = (2*i+1)*o and b = (2*i+1)*(o+1) :
829 # 2 * C((2*i+1)*(2*o+1),2N) * C(2*i+1,2N) = C((2*i+1)*o,N) + C((2*i+1)*(o+1),N)
831 # This leads us to the Lee DCT algorithm :
833 def unscaled_DCT_Lee (N, input, output):
834 even_input = vector(N/2)
835 odd_input = vector(N/2)
836 even_output = vector(N/2)
837 odd_output = vector(N/2)
839 for i in range(N/2):
840 even_input[i] = input[i] + input[N-1-i]
841 odd_input[i] = input[i] - input[N-1-i]
843 for i in range(N/2):
844 odd_input[i] = odd_input[i] * (0.5 / C (2*i+1, N))
846 unscaled_DCT (N/2, even_input, even_output)
847 unscaled_DCT (N/2, odd_input, odd_output)
849 for i in range(N/2-1):
850 odd_output[i] = odd_output[i] + odd_output[i+1]
852 for i in range(N/2):
853 output[2*i] = even_output[i]
854 output[2*i+1] = odd_output[i];
856 # Notes about this algorithm :
858 # The algorithm can be easily inverted to calculate the IDCT instead :
859 # each of the basic stages are separately inversible...
861 # This function does N adds, then N/2 muls, then 2 recursive calls with
862 # size N/2, then N/2-1 adds again. If we apply it recursively, the total
863 # number of operations will be N*log2(N)/2 multiplies and N*(3*log2(N)/2-1) + 1
864 # additions. So this is much faster than the canonical algorithm.
866 # Some of the multiplication coefficients 0.5/cos(...) can get quite large.
867 # This means that a small error in the input will give a large error on the
868 # output... For a DCT of size N the biggest coefficient will be at i=N/2-1
869 # and it will be slighly more than N/pi which can be large for large N's.
871 # In the IDCT however, the multiplication coefficients for the reverse
872 # transformation are of the form 2*cos(...) so they can not get big and there
873 # is no accuracy problem.
875 # You can find another description of this algorithm at
876 # http://www.intel.com/drg/mmx/appnotes/ap533.htm
880 # Another idea is to observe that the DCT calculation can be made to look like
881 # the DFT calculation : C (k, N) is the real part of W (k, 4*N) or W (-k, 4*N).
882 # We can use this idea translate the DCT algorithm into a call to the DFT
883 # algorithm :
885 def unscaled_DCT_DFT (N, input, output):
886 DFT_input = vector (4*N)
887 DFT_output = vector (4*N)
889 for i in range(N):
890 DFT_input[2*i+1] = input[i]
891 #DFT_input[4*N-2*i-1] = input[i] # We could use this instead
893 unscaled_DFT (4*N, DFT_input, DFT_output)
895 for i in range(N):
896 output[i] = DFT_output[i].real
899 # We can then use our knowledge of the DFT calculation to optimize for this
900 # particular case. For example using the radix-2 decimation-in-time method :
902 #def unscaled_DCT_DFT (N, input, output):
903 # DFT_input = vector (2*N)
904 # DFT_output = vector (2*N)
906 # for i in range(N):
907 # DFT_input[i] = input[i]
908 # #DFT_input[2*N-1-i] = input[i] # We could use this instead
910 # unscaled_DFT (2*N, DFT_input, DFT_output)
912 # for i in range(N):
913 # DFT_output[i] = DFT_output[i] * W (i, 4*N)
915 # for i in range(N):
916 # output[i] = DFT_output[i].real
918 # This leads us to the AAN implementation of the DCT algorithm : if we set
919 # both DFT_input[i] and DFT_input[2*N-1-i] to input[i], then the imaginary
920 # parts of W(2*i+1) and W(-2*i-1) will compensate, and output_DFT[i] will
921 # already be a real after the multiplication by W(i,4*N). Which means that
922 # before the multiplication, it is the product of a real number and W(-i,4*N).
923 # This leads to the following code, called the AAN algorithm :
925 def unscaled_DCT_AAN (N, input, output):
926 DFT_input = vector (2*N)
927 DFT_output = vector (2*N)
929 for i in range(N):
930 DFT_input[i] = input[i]
931 DFT_input[2*N-1-i] = input[i]
933 symetrical_unscaled_DFT (2*N, DFT_input, DFT_output)
935 for i in range(N):
936 output[i] = DFT_output[i].real * (0.5 / C (i, N))
938 # Notes about the AAN algorithm :
940 # The cost of this function is N real multiplies and a DFT of size 2*N. The
941 # DFT to calculate has special properties : the inputs are real and symmetric.
942 # Also, we only need to calculate the real parts of the N first DFT outputs.
943 # We can try to take advantage of all that.
945 # We can invert this algorithm to calculate the IDCT. The final multiply
946 # stage is trivially invertible. The DFT stage is invertible too, but we have
947 # to take into account the special properties of this particular DFT for that.
949 # Once again we have to take care of numerical precision for the DFT : the
950 # output coefficients can get large, so that a small error in the input will
951 # give a large error on the output... For a DCT of size N the biggest
952 # coefficient will be at i=N/2-1 and it will be slightly more than N/pi
954 # You can find another description of this algorithm at this url :
955 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fastdct.html
956 # (It is the same server where we already found a description of the fast DFT)
959 # To optimize the DFT calculation, we can take a lot of specific things into
960 # account : the input is real and symetric, and we only care about the real
961 # part of the output. Also, we only care about the N first output coefficients,
962 # but that one does not save operations actually, because the other
963 # coefficients are the conjugates of the ones we look anyway.
965 # One useful way to use the symetry of the input is to use the radix-2
966 # decimation-in-frequency algorithm. We can write a version of
967 # unscaled_DFT_radix2_freq for the case where the input is symetrical :
968 # we have removed a few additions in the first stages because even_input
969 # is symetrical and odd_input is antisymetrical. Also, we have modified the
970 # odd_input vector so that the second half of it is set to zero and the real
971 # part of the DFT output is not modified. After that modification, the second
972 # part of the odd_input was null so we used the radix-2 decimation-in-frequency
973 # again on the odd DFT. Also odd_output is symetrical because input is real...
975 def symetrical_unscaled_DFT (N, input, output):
976 even_input = vector(N/2)
977 odd_tmp = vector(N/2)
978 odd_input = vector(N/2)
979 even_output = vector(N/2)
980 odd_output = vector(N/2)
982 for i in range(N/4):
983 even_input[N/2-i-1] = even_input[i] = input[i] + input[N/2-1-i]
985 for i in range(N/4):
986 odd_tmp[i] = input[i] - input[N/2-1-i]
988 odd_input[0] = odd_tmp[0]
989 for i in range(N/4)[1:]:
990 odd_input[i] = (odd_tmp[i] + odd_tmp[i-1]) * W (i, N)
992 unscaled_DFT (N/2, even_input, even_output)
993 # symetrical real inputs, real outputs
995 unscaled_DFT (N/4, odd_input, odd_output)
996 # complex inputs, real outputs
998 for i in range(N/2):
999 output[2*i] = even_output[i]
1001 for i in range(N/4):
1002 output[N-1-4*i] = output[4*i+1] = odd_output[i]
1004 # This procedure takes 3*N/4-1 real additions and N/2-3 real multiplies,
1005 # followed by another symetrical real DFT of size N/2 and a "complex to real"
1006 # DFT of size N/4.
1008 # We thus get the following performance results :
1009 # N real additions real multiplies complex multiplies
1010 # 1 0 0 0
1011 # 2 0 0 0
1012 # 4 2 0 0
1013 # 8 9 1 0
1014 # 16 28 6 0
1015 # 32 76 21 0
1016 # 64 189 54 2
1017 # 128 451 125 10
1018 # 256 1042 270 36
1019 # 512 2358 565 108
1020 # 1024 5251 1158 294
1021 # 2048 11557 2349 750
1022 # 4096 25200 4734 1832
1023 # 8192 54544 9509 4336
1024 # 16384 117337 19062 10026
1025 # 32768 251127 38173 22770
1026 # 65536 535102 76398 50988
1029 # We thus get a better performance with the AAN DCT algorithm than with the
1030 # Lee DCT algorithm : we can do a DCT of size 32 with 189 additions, 54+32 real
1031 # multiplies, and 2 complex multiplies. The Lee algorithm would have used 209
1032 # additions and 80 multiplies. With the AAN algorithm, we also have the
1033 # advantage that a big number of the multiplies are actually grouped at the
1034 # output stage of the algorithm, so if we want to do a DCT followed by a
1035 # quantization stage, we will be able to group the multiply of the output with
1036 # the multiply of the quantization stage, thus saving 32 more operations. In
1037 # the mpeg audio layer 1 or 2 processing, we can also group the multiply of the
1038 # output with the multiply of the convolution stage...
1040 # Another source code for the AAN algorithm (implemented on 8 points, and
1041 # without all of the explanations) can be found at this URL :
1042 # http://developer.intel.com/drg/pentiumII/appnotes/aan_org.c . This
1043 # implementation uses 28 adds and 6+8 muls instead of 29 adds and 5+8 muls -
1044 # the difference is that in the symetrical_unscaled_DFT procedure, they noticed
1045 # how odd_input[i] and odd_input[N/4-i] will be combined at the start of the
1046 # complex-to-real DFT and they took advantage of this to convert 2 real adds
1047 # and 4 real muls into one complex multiply.
1050 # TODO : write about multi-dimentional DCT
1053 # TEST CODE
1055 def dump (vector):
1056 str = ""
1057 for i in range(len(vector)):
1058 if i:
1059 str = str + ", "
1060 vector[i] = vector[i] + 0j
1061 realstr = "%+.4f" % vector[i].real
1062 imagstr = "%+.4fj" % vector[i].imag
1063 if (realstr == "-0.0000"):
1064 realstr = "+0.0000"
1065 if (imagstr == "-0.0000j"):
1066 imagstr = "+0.0000j"
1067 str = str + realstr #+ imagstr
1068 return "[%s]" % str
1070 import whrandom
1072 def test(N):
1073 input = vector(N)
1074 output = vector(N)
1075 verify = vector(N)
1077 for i in range(N):
1078 input[i] = whrandom.random() + 1j * whrandom.random()
1080 unscaled_DFT (N, input, output)
1081 unscaled_DFT (N, input, verify)
1083 if (dump(output) != dump(verify)):
1084 print dump(output)
1085 print dump(verify)
1087 #test (64)
1090 # PERFORMANCE ANALYSIS CODE
1092 def display (table):
1093 N = 1
1094 print "#\tN\treal additions\treal multiplies\tcomplex multiplies"
1095 while table.has_key(N):
1096 print "#%8d%16d%16d%16d" % (N, table[N][0], table[N][1], table[N][2])
1097 N = 2*N
1098 print
1100 best_complex_DFT = {}
1102 def complex_DFT (max_N):
1103 best_complex_DFT[1] = (0,0,0)
1104 best_complex_DFT[2] = (4,0,0)
1105 best_complex_DFT[4] = (16,0,0)
1106 N = 8
1107 while (N<=max_N):
1108 # best method = split radix
1109 best2 = best_complex_DFT[N/2]
1110 best4 = best_complex_DFT[N/4]
1111 best_complex_DFT[N] = (best2[0] + 2*best4[0] + 3*N + 4,
1112 best2[1] + 2*best4[1] + 4,
1113 best2[2] + 2*best4[2] + N/2 - 4)
1114 N = 2*N
1116 best_real_DFT = {}
1118 def real_DFT (max_N):
1119 best_real_DFT[1] = (0,0,0)
1120 best_real_DFT[2] = (2,0,0)
1121 best_real_DFT[4] = (6,0,0)
1122 N = 8
1123 while (N<=max_N):
1124 # best method = split radix decimate-in-frequency
1125 best2 = best_real_DFT[N/2]
1126 best4 = best_complex_DFT[N/4]
1127 best_real_DFT[N] = (best2[0] + best4[0] + N + 2,
1128 best2[1] + best4[1] + 2,
1129 best2[2] + best4[2] + N/4 - 2)
1130 N = 2*N
1132 best_complex2real_DFT = {}
1134 def complex2real_DFT (max_N):
1135 best_complex2real_DFT[1] = (0,0,0)
1136 best_complex2real_DFT[2] = (2,0,0)
1137 best_complex2real_DFT[4] = (8,0,0)
1138 N = 8
1139 while (N<=max_N):
1140 best2 = best_complex2real_DFT[N/2]
1141 best4 = best_complex_DFT[N/4]
1142 best_complex2real_DFT[N] = (best2[0] + best4[0] + 3*N/2 + 1,
1143 best2[1] + best4[1] + 2,
1144 best2[2] + best4[2] + N/4 - 2)
1145 N = 2*N
1147 best_real_symetric_DFT = {}
1149 def real_symetric_DFT (max_N):
1150 best_real_symetric_DFT[1] = (0,0,0)
1151 best_real_symetric_DFT[2] = (0,0,0)
1152 best_real_symetric_DFT[4] = (2,0,0)
1153 N = 8
1154 while (N<=max_N):
1155 best2 = best_real_symetric_DFT[N/2]
1156 best4 = best_complex2real_DFT[N/4]
1157 best_real_symetric_DFT[N] = (best2[0] + best4[0] + 3*N/4 - 1,
1158 best2[1] + best4[1] + N/2 - 3,
1159 best2[2] + best4[2])
1160 N = 2*N
1162 complex_DFT (65536)
1163 real_DFT (65536)
1164 complex2real_DFT (65536)
1165 real_symetric_DFT (65536)
1168 print "complex DFT"
1169 display (best_complex_DFT)
1171 print "real DFT"
1172 display (best_real_DFT)
1174 print "complex2real DFT"
1175 display (best_complex2real_DFT)
1177 print "real symetric DFT"
1178 display (best_real_symetric_DFT)