4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * @file libavcodec/simple_idct.c
29 based upon some commented-out C code from mpeg2dec (idct_mmx.c,
30 written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
35 #include "simple_idct.h"
/*
 * Fixed-point cosine weights W1..W7 used by the 8-point row and column IDCTs.
 * NOTE(review): two complete sets are defined back to back in this listing;
 * the preprocessor conditional that selects one of them is not visible here,
 * so confirm against the full file which set is active.
 */
/* First set: weights scaled by 2048 (see the per-line formulas). */
38 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
39 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
40 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
41 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
42 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
43 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
44 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/*
 * Second set: weights scaled by 1<<14; "i" in the formula is the digit in the
 * macro name.
 * NOTE(review): for i = 4 the quoted formula evaluates to 16384, yet W4 is
 * 16383. Presumably deliberate -- confirm before "correcting" it.
 */
48 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
52 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
53 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
54 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
 * Right shift applied to the sums of the column pass.
 * ROW_SHIFT, its row-pass counterpart, is defined on a line not visible here.
 */
56 #define COL_SHIFT 20 // 6
/*
 * 1-D 8-point inverse DCT applied in place to one row of coefficients, with a
 * shortcut for rows whose only non-zero coefficient is row[0].
 *
 * row: points to 8 consecutive DCTELEM coefficients; row[0]..row[7] are read
 *      and then overwritten with the transformed values.
 *
 * NOTE(review): this listing is missing lines (braces, preprocessor
 * conditionals, the declaration of temp, the initialisation of a1..a3, the
 * row[2] terms and the early returns). The comments below describe only the
 * statements that are visible.
 */
59 static inline void idctRowCondDC (DCTELEM
* row
)
61 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/*
 * Mask for the 16 bits that hold row[0] inside the first 64-bit word of the
 * row. Two alternatives are listed, presumably selected by byte order; the
 * selecting conditional is not visible here.
 */
70 #define ROW0_MASK 0xffff000000000000LL
72 #define ROW0_MASK 0xffffLL
/*
 * 64-bit DC-only test for 2-byte elements: every bit of the row other than
 * those of row[0] must be zero.
 */
74 if(sizeof(DCTELEM
)==2){
75 if ( ((((uint64_t *)row
)[0] & ~ROW0_MASK
) |
76 ((uint64_t *)row
)[1]) == 0) {
/*
 * DC-only result is row[0] << 3, kept to 16 bits, then stored to both 64-bit
 * words of the row. NOTE(review): replicating the 16-bit value across the
 * word presumably happens on lines not shown.
 */
77 temp
= (row
[0] << 3) & 0xffff;
80 ((uint64_t *)row
)[0] = temp
;
81 ((uint64_t *)row
)[1] = temp
;
/*
 * Element-wise DC-only test (presumably the branch for other element sizes):
 * if row[1]..row[7] are all zero, every output equals row[0] << 3.
 */
85 if (!(row
[1]|row
[2]|row
[3]|row
[4]|row
[5]|row
[6]|row
[7])) {
86 row
[0]=row
[1]=row
[2]=row
[3]=row
[4]=row
[5]=row
[6]=row
[7]= row
[0] << 3;
/*
 * Same DC-only shortcut written with 32-bit word accesses; part of its
 * condition is on lines not shown.
 */
91 if(sizeof(DCTELEM
)==2){
92 if (!(((uint32_t*)row
)[1] |
96 temp
= (row
[0] << 3) & 0xffff;
98 ((uint32_t*)row
)[0]=((uint32_t*)row
)[1] =
99 ((uint32_t*)row
)[2]=((uint32_t*)row
)[3] = temp
;
103 if (!(row
[1]|row
[2]|row
[3]|row
[4]|row
[5]|row
[6]|row
[7])) {
104 row
[0]=row
[1]=row
[2]=row
[3]=row
[4]=row
[5]=row
[6]=row
[7]= row
[0] << 3;
/*
 * General path, even part: a0 starts as W4*row[0] plus the rounding constant
 * for the final >> ROW_SHIFT.
 */
110 a0
= (W4
* row
[0]) + (1 << (ROW_SHIFT
- 1));
115 /* no need to optimize : gcc does it */
/*
 * Odd part: b0..b3 collect the row[1] and row[3] terms.
 * MUL16/MAC16 are defined elsewhere; presumably a 16-bit multiply and a
 * multiply-accumulate into the first argument.
 */
121 b0
= MUL16(W1
, row
[1]);
122 MAC16(b0
, W3
, row
[3]);
123 b1
= MUL16(W3
, row
[1]);
124 MAC16(b1
, -W7
, row
[3]);
125 b2
= MUL16(W5
, row
[1]);
126 MAC16(b2
, -W1
, row
[3]);
127 b3
= MUL16(W7
, row
[1]);
128 MAC16(b3
, -W5
, row
[3]);
/*
 * temp gathers words from the second half of the row (one 64-bit load, or the
 * OR of two 32-bit words). Presumably it gates the row[4]..row[7] terms
 * below; the test itself is not visible.
 */
131 temp
= ((uint64_t*)row
)[1];
133 temp
= ((uint32_t*)row
)[2] | ((uint32_t*)row
)[3];
/* row[4] and row[6] terms of the even part. */
136 a0
+= W4
*row
[4] + W6
*row
[6];
137 a1
+= - W4
*row
[4] - W2
*row
[6];
138 a2
+= - W4
*row
[4] + W2
*row
[6];
139 a3
+= W4
*row
[4] - W6
*row
[6];
/* row[5] and row[7] terms of the odd part. */
141 MAC16(b0
, W5
, row
[5]);
142 MAC16(b0
, W7
, row
[7]);
144 MAC16(b1
, -W1
, row
[5]);
145 MAC16(b1
, -W5
, row
[7]);
147 MAC16(b2
, W7
, row
[5]);
148 MAC16(b2
, W3
, row
[7]);
150 MAC16(b3
, W3
, row
[5]);
151 MAC16(b3
, -W1
, row
[7]);
/*
 * Final butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), each
 * shifted right by ROW_SHIFT.
 */
154 row
[0] = (a0
+ b0
) >> ROW_SHIFT
;
155 row
[7] = (a0
- b0
) >> ROW_SHIFT
;
156 row
[1] = (a1
+ b1
) >> ROW_SHIFT
;
157 row
[6] = (a1
- b1
) >> ROW_SHIFT
;
158 row
[2] = (a2
+ b2
) >> ROW_SHIFT
;
159 row
[5] = (a2
- b2
) >> ROW_SHIFT
;
160 row
[3] = (a3
+ b3
) >> ROW_SHIFT
;
161 row
[4] = (a3
- b3
) >> ROW_SHIFT
;
/*
 * 1-D 8-point inverse DCT over one column of the block (elements col[8*k]),
 * storing each result to dest through the cm lookup table.
 *
 * dest:      output pixel pointer; each visible store writes dest[0].
 * line_size: not used by any visible statement; presumably the amount dest is
 *            advanced by between stores (those lines are not shown).
 *
 * NOTE(review): the signature is cut off after line_size (the col parameter
 * is used below but its declaration is not visible), and the even-part
 * updates of a0..a3 beyond the first line are missing from this listing.
 */
164 static inline void idctSparseColPut (uint8_t *dest
, int line_size
,
167 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/*
 * cm points MAX_NEG_CROP entries into ff_cropTbl and is indexed with the
 * shifted sums below. ff_cropTbl is defined elsewhere; presumably a clamping
 * table, with the offset making negative indices valid.
 */
168 uint8_t *cm
= ff_cropTbl
+ MAX_NEG_CROP
;
/* The rounding term for >> COL_SHIFT is folded into the DC coefficient
   before it is multiplied by W4. */
170 /* XXX: I did that only to give same values as previous code */
171 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part: b0..b3 collect the terms of column elements 1, 3, 5 and 7. */
181 b0
= MUL16(W1
, col
[8*1]);
182 b1
= MUL16(W3
, col
[8*1]);
183 b2
= MUL16(W5
, col
[8*1]);
184 b3
= MUL16(W7
, col
[8*1]);
186 MAC16(b0
, + W3
, col
[8*3]);
187 MAC16(b1
, - W7
, col
[8*3]);
188 MAC16(b2
, - W1
, col
[8*3]);
189 MAC16(b3
, - W5
, col
[8*3]);
199 MAC16(b0
, + W5
, col
[8*5]);
200 MAC16(b1
, - W1
, col
[8*5]);
201 MAC16(b2
, + W7
, col
[8*5]);
202 MAC16(b3
, + W3
, col
[8*5]);
213 MAC16(b0
, + W7
, col
[8*7]);
214 MAC16(b1
, - W5
, col
[8*7]);
215 MAC16(b2
, + W3
, col
[8*7]);
216 MAC16(b3
, - W1
, col
[8*7]);
/*
 * Eight outputs in the order a0+b0, a1+b1, a2+b2, a3+b3, a3-b3, a2-b2, a1-b1,
 * a0-b0, each shifted by COL_SHIFT and mapped through cm.
 * NOTE(review): every store targets dest[0]; the pointer advance between
 * stores is on lines not shown.
 */
219 dest
[0] = cm
[(a0
+ b0
) >> COL_SHIFT
];
221 dest
[0] = cm
[(a1
+ b1
) >> COL_SHIFT
];
223 dest
[0] = cm
[(a2
+ b2
) >> COL_SHIFT
];
225 dest
[0] = cm
[(a3
+ b3
) >> COL_SHIFT
];
227 dest
[0] = cm
[(a3
- b3
) >> COL_SHIFT
];
229 dest
[0] = cm
[(a2
- b2
) >> COL_SHIFT
];
231 dest
[0] = cm
[(a1
- b1
) >> COL_SHIFT
];
233 dest
[0] = cm
[(a0
- b0
) >> COL_SHIFT
];
/*
 * 1-D 8-point inverse DCT over one column of the block (elements col[8*k]),
 * adding each result to the value already at dest and storing the sum back
 * through the cm lookup table.
 *
 * dest:      pixel pointer that is both read and written (dest[0]).
 * line_size: not used by any visible statement; presumably the amount dest is
 *            advanced by between stores (those lines are not shown).
 *
 * NOTE(review): the signature is cut off after line_size (the col parameter
 * is used below but its declaration is not visible), and the even-part
 * updates of a0..a3 beyond the first line are missing from this listing.
 */
236 static inline void idctSparseColAdd (uint8_t *dest
, int line_size
,
239 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/*
 * cm points MAX_NEG_CROP entries into ff_cropTbl; ff_cropTbl is defined
 * elsewhere and is presumably a clamping table.
 */
240 uint8_t *cm
= ff_cropTbl
+ MAX_NEG_CROP
;
/* The rounding term for >> COL_SHIFT is folded into the DC coefficient
   before it is multiplied by W4. */
242 /* XXX: I did that only to give same values as previous code */
243 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part: b0..b3 collect the terms of column elements 1, 3, 5 and 7. */
253 b0
= MUL16(W1
, col
[8*1]);
254 b1
= MUL16(W3
, col
[8*1]);
255 b2
= MUL16(W5
, col
[8*1]);
256 b3
= MUL16(W7
, col
[8*1]);
258 MAC16(b0
, + W3
, col
[8*3]);
259 MAC16(b1
, - W7
, col
[8*3]);
260 MAC16(b2
, - W1
, col
[8*3]);
261 MAC16(b3
, - W5
, col
[8*3]);
271 MAC16(b0
, + W5
, col
[8*5]);
272 MAC16(b1
, - W1
, col
[8*5]);
273 MAC16(b2
, + W7
, col
[8*5]);
274 MAC16(b3
, + W3
, col
[8*5]);
285 MAC16(b0
, + W7
, col
[8*7]);
286 MAC16(b1
, - W5
, col
[8*7]);
287 MAC16(b2
, + W3
, col
[8*7]);
288 MAC16(b3
, - W1
, col
[8*7]);
/*
 * Eight outputs in the order a0+b0, a1+b1, a2+b2, a3+b3, a3-b3, a2-b2, a1-b1,
 * a0-b0: each shifted sum is added to dest[0] and the total mapped through cm.
 * NOTE(review): every store targets dest[0]; the pointer advance between
 * stores is on lines not shown.
 */
291 dest
[0] = cm
[dest
[0] + ((a0
+ b0
) >> COL_SHIFT
)];
293 dest
[0] = cm
[dest
[0] + ((a1
+ b1
) >> COL_SHIFT
)];
295 dest
[0] = cm
[dest
[0] + ((a2
+ b2
) >> COL_SHIFT
)];
297 dest
[0] = cm
[dest
[0] + ((a3
+ b3
) >> COL_SHIFT
)];
299 dest
[0] = cm
[dest
[0] + ((a3
- b3
) >> COL_SHIFT
)];
301 dest
[0] = cm
[dest
[0] + ((a2
- b2
) >> COL_SHIFT
)];
303 dest
[0] = cm
[dest
[0] + ((a1
- b1
) >> COL_SHIFT
)];
305 dest
[0] = cm
[dest
[0] + ((a0
- b0
) >> COL_SHIFT
)];
/*
 * 1-D 8-point inverse DCT over one column of the block, computed in place:
 * reads col[8*k] and writes the results back to col[0], col[8], ... col[56].
 *
 * col: points to the top element of a column inside a block whose rows are
 *      8 elements apart.
 *
 * NOTE(review): the even-part updates of a0..a3 beyond the first line are
 * missing from this listing.
 */
308 static inline void idctSparseCol (DCTELEM
* col
)
310 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/* The rounding term for >> COL_SHIFT is folded into the DC coefficient
   before it is multiplied by W4. */
312 /* XXX: I did that only to give same values as previous code */
313 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part: b0..b3 collect the terms of column elements 1, 3, 5 and 7. */
323 b0
= MUL16(W1
, col
[8*1]);
324 b1
= MUL16(W3
, col
[8*1]);
325 b2
= MUL16(W5
, col
[8*1]);
326 b3
= MUL16(W7
, col
[8*1]);
328 MAC16(b0
, + W3
, col
[8*3]);
329 MAC16(b1
, - W7
, col
[8*3]);
330 MAC16(b2
, - W1
, col
[8*3]);
331 MAC16(b3
, - W5
, col
[8*3]);
341 MAC16(b0
, + W5
, col
[8*5]);
342 MAC16(b1
, - W1
, col
[8*5]);
343 MAC16(b2
, + W7
, col
[8*5]);
344 MAC16(b3
, + W3
, col
[8*5]);
355 MAC16(b0
, + W7
, col
[8*7]);
356 MAC16(b1
, - W5
, col
[8*7]);
357 MAC16(b2
, + W3
, col
[8*7]);
358 MAC16(b3
, - W1
, col
[8*7]);
/*
 * Final butterfly written back into the column: rows 0..3 receive a_k + b_k,
 * rows 4..7 receive a_k - b_k in mirrored order, all shifted by COL_SHIFT.
 */
361 col
[0 ] = ((a0
+ b0
) >> COL_SHIFT
);
362 col
[8 ] = ((a1
+ b1
) >> COL_SHIFT
);
363 col
[16] = ((a2
+ b2
) >> COL_SHIFT
);
364 col
[24] = ((a3
+ b3
) >> COL_SHIFT
);
365 col
[32] = ((a3
- b3
) >> COL_SHIFT
);
366 col
[40] = ((a2
- b2
) >> COL_SHIFT
);
367 col
[48] = ((a1
- b1
) >> COL_SHIFT
);
368 col
[56] = ((a0
- b0
) >> COL_SHIFT
);
/*
 * 2-D inverse DCT of a block, with the result stored to dest.
 *
 * dest:      output pixels.
 * line_size: passed through to the column pass.
 * block:     coefficient block; rows are 8 elements apart (block + i*8) and
 *            it is modified by the row pass.
 *
 * NOTE(review): the loops over i and the declaration of i are not visible in
 * this listing.
 */
371 void ff_simple_idct_put(uint8_t *dest
, int line_size
, DCTELEM
*block
)
/* Row pass: transform row i in place. */
375 idctRowCondDC(block
+ i
*8);
/* Column pass: transform column i and store it at dest + i. */
378 idctSparseColPut(dest
+ i
, line_size
, block
+ i
);
/*
 * 2-D inverse DCT of a block, with the result added to the pixels already
 * in dest.
 *
 * dest:      pixels that are read, updated and written back.
 * line_size: passed through to the column pass.
 * block:     coefficient block; rows are 8 elements apart (block + i*8) and
 *            it is modified by the row pass.
 *
 * NOTE(review): the loops over i and the declaration of i are not visible in
 * this listing.
 */
381 void ff_simple_idct_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
/* Row pass: transform row i in place. */
385 idctRowCondDC(block
+ i
*8);
/* Column pass: transform column i and add it to the pixels at dest + i. */
388 idctSparseColAdd(dest
+ i
, line_size
, block
+ i
);
/*
 * 2-D inverse DCT of a block, computed in place.
 *
 * block: coefficient block; rows are 8 elements apart (block + i*8). Both
 *        passes overwrite it.
 *
 * NOTE(review): the loops over i and the declaration of i are not visible in
 * this listing.
 */
391 void ff_simple_idct(DCTELEM
*block
)
/* Row pass: transform row i in place. */
395 idctRowCondDC(block
+ i
*8);
/* Column pass: transform column i in place. */
398 idctSparseCol(block
+ i
);
/*
 * Fixed-point constants for the 4-point column IDCT used by idct4col_put.
 * C_FIX converts a real constant to an integer scaled by 1 << CN_SHIFT with
 * rounding; CN_SHIFT is defined on a line not visible here.
 * The two literals equal cos(pi/8)/sqrt(2) and sin(pi/8)/sqrt(2).
 */
404 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
405 #define C1 C_FIX(0.6532814824)
406 #define C2 C_FIX(0.2705980501)
408 /* The row IDCT output is multiplied by 16 * sqrt(2.0), the column idct4 is
409 normalized, and the butterfly must be multiplied by 0.5 * sqrt(2.0). */
/* Right shift applied to the column sums before they index the output table. */
410 #define C_SHIFT (4+1+12)
/*
 * 4-point inverse DCT over one column, storing each of the four results to
 * dest through the cm lookup table.
 *
 * dest:      output pixel pointer; each visible store writes dest[0].
 * line_size: not used by any visible statement; presumably the amount dest is
 *            advanced by between stores (those lines are not shown).
 * col:       input column (read-only). The loads of a0..a3 from it are on
 *            lines not visible here.
 */
412 static inline void idct4col_put(uint8_t *dest
, int line_size
, const DCTELEM
*col
)
414 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
/*
 * cm points MAX_NEG_CROP entries into ff_cropTbl; ff_cropTbl is defined
 * elsewhere and is presumably a clamping table.
 */
415 const uint8_t *cm
= ff_cropTbl
+ MAX_NEG_CROP
;
/*
 * Even part: sum and difference of a0 and a2, scaled by 1 << (CN_SHIFT - 1),
 * plus the rounding constant for the final >> C_SHIFT.
 */
421 c0
= ((a0
+ a2
) << (CN_SHIFT
- 1)) + (1 << (C_SHIFT
- 1));
422 c2
= ((a0
- a2
) << (CN_SHIFT
- 1)) + (1 << (C_SHIFT
- 1));
/* Odd part: rotation of (a1, a3) by the constants C1 and C2. */
423 c1
= a1
* C1
+ a3
* C2
;
424 c3
= a1
* C2
- a3
* C1
;
/*
 * Four outputs in the order c0+c1, c2+c3, c2-c3, c0-c1.
 * NOTE(review): every store targets dest[0]; the pointer advance between
 * stores is on lines not shown.
 */
425 dest
[0] = cm
[(c0
+ c1
) >> C_SHIFT
];
427 dest
[0] = cm
[(c2
+ c3
) >> C_SHIFT
];
429 dest
[0] = cm
[(c2
- c3
) >> C_SHIFT
];
431 dest
[0] = cm
[(c0
- c1
) >> C_SHIFT
];
440 ptr[8 + k] = a0 - a1;\
443 /* Only used by the DV codec. The input must be interlaced. 128 is added
444 to the pixels before clamping to avoid a systematic error (a
445 1024*sqrt(2) offset would be needed otherwise). */
446 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
447 compensate for the extra butterfly stage - I don't have the full DV specification */
/*
 * Inverse transform that pairs an 8-point row IDCT with two interleaved
 * 4-point column IDCTs, storing the result to dest.
 *
 * dest:      output pixels.
 * line_size: distance between output lines; each column call is given
 *            2 * line_size, and the second call starts at dest + line_size.
 * block:     coefficient block; rows are 8 elements apart and it is modified
 *            by the row pass.
 *
 * NOTE(review): the statements between the signature and the row pass, the
 * loops over i and the declaration of i are not visible in this listing.
 */
449 void ff_simple_idct248_put(uint8_t *dest
, int line_size
, DCTELEM
*block
)
468 /* IDCT8 on each line */
470 idctRowCondDC(block
+ i
*8);
473 /* IDCT4 and store */
/*
 * The first call takes its column from block + i and writes from dest + i;
 * the second takes block + 8 + i and writes from dest + line_size + i.
 */
475 idct4col_put(dest
+ i
, 2 * line_size
, block
+ i
);
476 idct4col_put(dest
+ line_size
+ i
, 2 * line_size
, block
+ 8 + i
);
480 /* 8x4 & 4x8 WMV2 IDCT */
/*
 * Constants for the 4-point column IDCT used by idct4col_add. Unlike the
 * earlier C_FIX, this version includes an extra factor of 1.414213562
 * (sqrt(2)) in every constant, and adds C3 for the even part.
 * NOTE(review): C_FIX, C1, C2 and C_SHIFT are also defined earlier in this
 * listing; the lines that undefine them are presumably among those not
 * visible here. CN_SHIFT is likewise defined elsewhere.
 */
487 #define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
488 #define C1 C_FIX(0.6532814824)
489 #define C2 C_FIX(0.2705980501)
490 #define C3 C_FIX(0.5)
/* Right shift applied to the column sums before they are added to dest. */
491 #define C_SHIFT (4+1+12)
/*
 * 4-point inverse DCT over one column, adding each of the four results to
 * the value already at dest and storing the sum back through the cm table.
 *
 * dest:      pixel pointer that is both read and written (dest[0]).
 * line_size: not used by any visible statement; presumably the amount dest is
 *            advanced by between stores (those lines are not shown).
 * col:       input column (read-only). The loads of a0..a3 from it are on
 *            lines not visible here.
 */
492 static inline void idct4col_add(uint8_t *dest
, int line_size
, const DCTELEM
*col
)
494 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
/*
 * cm points MAX_NEG_CROP entries into ff_cropTbl; ff_cropTbl is defined
 * elsewhere and is presumably a clamping table.
 */
495 const uint8_t *cm
= ff_cropTbl
+ MAX_NEG_CROP
;
/*
 * Even part: sum and difference of a0 and a2 scaled by C3, plus the rounding
 * constant for the final >> C_SHIFT.
 */
501 c0
= (a0
+ a2
)*C3
+ (1 << (C_SHIFT
- 1));
502 c2
= (a0
- a2
)*C3
+ (1 << (C_SHIFT
- 1));
/* Odd part: rotation of (a1, a3) by the constants C1 and C2. */
503 c1
= a1
* C1
+ a3
* C2
;
504 c3
= a1
* C2
- a3
* C1
;
/*
 * Four outputs in the order c0+c1, c2+c3, c2-c3, c0-c1; each shifted sum is
 * added to dest[0] and the total mapped through cm.
 * NOTE(review): every store targets dest[0]; the pointer advance between
 * stores is on lines not shown.
 */
505 dest
[0] = cm
[dest
[0] + ((c0
+ c1
) >> C_SHIFT
)];
507 dest
[0] = cm
[dest
[0] + ((c2
+ c3
) >> C_SHIFT
)];
509 dest
[0] = cm
[dest
[0] + ((c2
- c3
) >> C_SHIFT
)];
511 dest
[0] = cm
[dest
[0] + ((c0
- c1
) >> C_SHIFT
)];
/*
 * Fixed-point constants for the 4-point row IDCT (idct4row). R_FIX scales a
 * real constant by sqrt(2) * (1 << RN_SHIFT) and rounds to an integer.
 * RN_SHIFT and R_SHIFT (used by idct4row) are defined on lines not visible
 * here.
 */
515 #define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
516 #define R1 R_FIX(0.6532814824)
517 #define R2 R_FIX(0.2705980501)
518 #define R3 R_FIX(0.5)
/*
 * 4-point inverse DCT applied in place to the first four elements of a row.
 *
 * row: coefficient row; row[0]..row[3] are overwritten with the results.
 *      The loads of a0..a3 from it are on lines not visible here.
 */
520 static inline void idct4row(DCTELEM
*row
)
522 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
523 //const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/*
 * Even part: sum and difference of a0 and a2 scaled by R3, plus the rounding
 * constant for the final >> R_SHIFT.
 */
529 c0
= (a0
+ a2
)*R3
+ (1 << (R_SHIFT
- 1));
530 c2
= (a0
- a2
)*R3
+ (1 << (R_SHIFT
- 1));
/* Odd part: rotation of (a1, a3) by the constants R1 and R2. */
531 c1
= a1
* R1
+ a3
* R2
;
532 c3
= a1
* R2
- a3
* R1
;
/* Outputs in the order c0+c1, c2+c3, c2-c3, c0-c1, each shifted by R_SHIFT. */
533 row
[0]= (c0
+ c1
) >> R_SHIFT
;
534 row
[1]= (c2
+ c3
) >> R_SHIFT
;
535 row
[2]= (c2
- c3
) >> R_SHIFT
;
536 row
[3]= (c0
- c1
) >> R_SHIFT
;
/*
 * Inverse transform using the 8-point row IDCT and the 4-point column IDCT,
 * with the result added to the pixels already in dest.
 *
 * dest:      pixels that are read, updated and written back.
 * line_size: passed through to the column pass.
 * block:     coefficient block; rows are 8 elements apart and it is modified
 *            by the row pass.
 *
 * NOTE(review): the loops over i (and therefore how many rows and columns are
 * processed) and the declaration of i are not visible in this listing.
 */
539 void ff_simple_idct84_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
543 /* IDCT8 on each line */
545 idctRowCondDC(block
+ i
*8);
548 /* IDCT4 and store */
550 idct4col_add(dest
+ i
, line_size
, block
+ i
);
/*
 * Inverse transform using the 4-point row IDCT and the 8-point column IDCT,
 * with the result added to the pixels already in dest.
 *
 * dest:      pixels that are read, updated and written back.
 * line_size: passed through to the column pass.
 * block:     coefficient block; rows are 8 elements apart and it is modified
 *            by the row pass.
 *
 * NOTE(review): the loops over i (and therefore how many rows and columns are
 * processed) and the declaration of i are not visible in this listing.
 */
554 void ff_simple_idct48_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
558 /* IDCT4 on each line */
560 idct4row(block
+ i
*8);
563 /* IDCT8 and store */
565 idctSparseColAdd(dest
+ i
, line_size
, block
+ i
);
/*
 * Inverse transform using the 4-point row IDCT and the 4-point column IDCT,
 * with the result added to the pixels already in dest.
 *
 * dest:      pixels that are read, updated and written back.
 * line_size: passed through to the column pass.
 * block:     coefficient block; rows are 8 elements apart (block + i*8) and
 *            it is modified by the row pass.
 *
 * NOTE(review): the loops over i (and therefore how many rows and columns are
 * processed) and the declaration of i are not visible in this listing.
 */
569 void ff_simple_idct44_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
573 /* IDCT4 on each line */
575 idct4row(block
+ i
*8);
578 /* IDCT4 and store */
580 idct4col_add(dest
+ i
, line_size
, block
+ i
);