2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * @file libavcodec/dct-test.c
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/common.h"
36 #include "libavutil/lfg.h"
38 #include "simple_idct.h"
39 #include "aandcttab.h"
42 #include "x86/idct_xvid.h"
/**
 * Copy c bytes from b to a; a thin wrapper that forwards to memcpy().
 *
 * @param a destination buffer
 * @param b source buffer (must not overlap a, exactly as for memcpy())
 * @param c number of bytes to copy
 * @return a, i.e. whatever memcpy() returns
 */
void *fast_memcpy(void *a, const void *b, size_t c)
{
    return memcpy(a, b, c);
}
48 /* reference fdct/idct */
49 void ff_ref_fdct(DCTELEM
*block
);
50 void ff_ref_idct(DCTELEM
*block
);
51 void ff_ref_dct_init(void);
53 void ff_mmx_idct(DCTELEM
*data
);
54 void ff_mmxext_idct(DCTELEM
*data
);
56 void odivx_idct_c(short *block
);
59 void ff_bfin_idct(DCTELEM
*block
);
60 void ff_bfin_fdct(DCTELEM
*block
);
63 void fdct_altivec(DCTELEM
*block
);
64 //void idct_altivec(DCTELEM *block);?? no routine
67 void j_rev_dct_ARM(DCTELEM
*data
);
68 void simple_idct_ARM(DCTELEM
*data
);
69 void simple_idct_armv5te(DCTELEM
*data
);
70 void ff_simple_idct_armv6(DCTELEM
*data
);
71 void ff_simple_idct_neon(DCTELEM
*data
);
73 void ff_simple_idct_axp(DCTELEM
*data
);
77 enum { FDCT
, IDCT
} is_idct
;
78 void (* func
) (DCTELEM
*block
);
79 void (* ref
) (DCTELEM
*block
);
80 enum formattag
{ NO_PERM
,MMX_PERM
, MMX_SIMPLE_PERM
, SCALE_PERM
, SSE2_PERM
, PARTTRANS_PERM
} format
;
84 #ifndef FAAN_POSTSCALE
85 #define FAAN_SCALE SCALE_PERM
87 #define FAAN_SCALE NO_PERM
92 struct algo algos
[] = {
93 {"REF-DBL", 0, ff_ref_fdct
, ff_ref_fdct
, NO_PERM
},
94 {"FAAN", 0, ff_faandct
, ff_ref_fdct
, FAAN_SCALE
},
95 {"FAANI", 1, ff_faanidct
, ff_ref_idct
, NO_PERM
},
96 {"IJG-AAN-INT", 0, fdct_ifast
, ff_ref_fdct
, SCALE_PERM
},
97 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow
, ff_ref_fdct
, NO_PERM
},
98 {"REF-DBL", 1, ff_ref_idct
, ff_ref_idct
, NO_PERM
},
99 {"INT", 1, j_rev_dct
, ff_ref_idct
, MMX_PERM
},
100 {"SIMPLE-C", 1, ff_simple_idct
, ff_ref_idct
, NO_PERM
},
103 {"MMX", 0, ff_fdct_mmx
, ff_ref_fdct
, NO_PERM
, FF_MM_MMX
},
105 {"MMX2", 0, ff_fdct_mmx2
, ff_ref_fdct
, NO_PERM
, FF_MM_MMX2
},
106 {"SSE2", 0, ff_fdct_sse2
, ff_ref_fdct
, NO_PERM
, FF_MM_SSE2
},
110 {"LIBMPEG2-MMX", 1, ff_mmx_idct
, ff_ref_idct
, MMX_PERM
, FF_MM_MMX
},
111 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct
, ff_ref_idct
, MMX_PERM
, FF_MM_MMX2
},
113 {"SIMPLE-MMX", 1, ff_simple_idct_mmx
, ff_ref_idct
, MMX_SIMPLE_PERM
, FF_MM_MMX
},
114 {"XVID-MMX", 1, ff_idct_xvid_mmx
, ff_ref_idct
, NO_PERM
, FF_MM_MMX
},
115 {"XVID-MMX2", 1, ff_idct_xvid_mmx2
, ff_ref_idct
, NO_PERM
, FF_MM_MMX2
},
116 {"XVID-SSE2", 1, ff_idct_xvid_sse2
, ff_ref_idct
, SSE2_PERM
, FF_MM_SSE2
},
120 {"altivecfdct", 0, fdct_altivec
, ff_ref_fdct
, NO_PERM
, FF_MM_ALTIVEC
},
124 {"BFINfdct", 0, ff_bfin_fdct
, ff_ref_fdct
, NO_PERM
},
125 {"BFINidct", 1, ff_bfin_idct
, ff_ref_idct
, NO_PERM
},
129 {"SIMPLE-ARM", 1, simple_idct_ARM
, ff_ref_idct
, NO_PERM
},
130 {"INT-ARM", 1, j_rev_dct_ARM
, ff_ref_idct
, MMX_PERM
},
132 {"SIMPLE-ARMV5TE", 1, simple_idct_armv5te
, ff_ref_idct
, NO_PERM
},
135 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6
, ff_ref_idct
, MMX_PERM
},
138 {"SIMPLE-NEON", 1, ff_simple_idct_neon
, ff_ref_idct
, PARTTRANS_PERM
},
140 #endif /* ARCH_ARM */
143 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp
, ff_ref_idct
, NO_PERM
},
149 #define AANSCALE_BITS 12
151 uint8_t cropTbl
[256 + 2 * MAX_NEG_CROP
];
153 static int64_t gettime(void)
156 gettimeofday(&tv
,NULL
);
157 return (int64_t)tv
.tv_sec
* 1000000 + tv
.tv_usec
;
/* Number of transform calls per timing batch in the speed-measurement loops. */
#define NB_ITS_SPEED 50000

/*
 * Coefficient permutation applied to the input of MMX_PERM transforms.
 * Zero until idct_mmx_init() fills it in.
 */
static short idct_mmx_perm[64];
/*
 * Coefficient permutation applied to the input of MMX_SIMPLE_PERM
 * transforms: source coefficient i is stored at index
 * idct_simple_mmx_perm[i] of the block handed to the tested routine.
 * The 64 entries form a permutation of 0..63.
 */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/*
 * Column reordering applied inside each row of 8 for SSE2_PERM input:
 * the coefficient from column c is stored at column idct_sse2_row_perm[c]
 * of the same row.
 */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
178 static void idct_mmx_init(void)
182 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
183 for (i
= 0; i
< 64; i
++) {
184 idct_mmx_perm
[i
] = (i
& 0x38) | ((i
& 6) >> 1) | ((i
& 1) << 2);
185 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
189 static DCTELEM block
[64] __attribute__ ((aligned (16)));
190 static DCTELEM block1
[64] __attribute__ ((aligned (8)));
191 static DCTELEM block_org
[64] __attribute__ ((aligned (8)));
193 static inline void mmx_emms(void)
196 if (cpu_flags
& FF_MM_MMX
)
197 __asm__
volatile ("emms\n\t");
201 static void dct_error(const char *name
, int is_idct
,
202 void (*fdct_func
)(DCTELEM
*block
),
203 void (*fdct_ref
)(DCTELEM
*block
), int form
, int test
)
207 int64_t err2
, ti
, ti1
, it1
;
208 int64_t sysErr
[64], sysErrMax
=0;
210 int blockSumErrMax
=0, blockSumErr
;
213 av_lfg_init(&prng
, 1);
217 for(i
=0; i
<64; i
++) sysErr
[i
]=0;
218 for(it
=0;it
<NB_ITS
;it
++) {
224 block1
[i
] = (av_lfg_get(&prng
) % 512) -256;
233 int num
= av_lfg_get(&prng
) % 10 + 1;
235 block1
[av_lfg_get(&prng
) % 64] = av_lfg_get(&prng
) % 512 -256;
238 block1
[0] = av_lfg_get(&prng
) % 4096 - 2048;
239 block1
[63]= (block1
[0]&1)^1;
243 #if 0 // simulate mismatch control
248 if((sum
&1)==0) block1
[63]^=1;
253 block_org
[i
]= block1
[i
];
255 if (form
== MMX_PERM
) {
257 block
[idct_mmx_perm
[i
]] = block1
[i
];
258 } else if (form
== MMX_SIMPLE_PERM
) {
260 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
262 } else if (form
== SSE2_PERM
) {
264 block
[(i
&0x38) | idct_sse2_row_perm
[i
&7]] = block1
[i
];
265 } else if (form
== PARTTRANS_PERM
) {
267 block
[(i
&0x24) | ((i
&3)<<3) | ((i
>>3)&3)] = block1
[i
];
272 #if 0 // simulate mismatch control for tested IDCT but not the ref
277 if((sum
&1)==0) block
[63]^=1;
284 if (form
== SCALE_PERM
) {
285 for(i
=0; i
<64; i
++) {
286 scale
= 8*(1 << (AANSCALE_BITS
+ 11)) / ff_aanscales
[i
];
287 block
[i
] = (block
[i
] * scale
/*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS
;
295 v
= abs(block
[i
] - block1
[i
]);
299 sysErr
[i
] += block
[i
] - block1
[i
];
301 if( abs(block
[i
])>maxout
) maxout
=abs(block
[i
]);
303 if(blockSumErrMax
< blockSumErr
) blockSumErrMax
= blockSumErr
;
304 #if 0 // print different matrix pairs
308 if((i
&7)==0) printf("\n");
309 printf("%4d ", block_org
[i
]);
312 if((i
&7)==0) printf("\n");
313 printf("%4d ", block
[i
] - block1
[i
]);
318 for(i
=0; i
<64; i
++) sysErrMax
= FFMAX(sysErrMax
, FFABS(sysErr
[i
]));
320 #if 1 // dump systematic errors
322 if(i
%8==0) printf("\n");
323 printf("%7d ", (int)sysErr
[i
]);
328 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
329 is_idct
? "IDCT" : "DCT",
330 name
, err_inf
, (double)err2
/ NB_ITS
/ 64.0, (double)sysErrMax
/ NB_ITS
, maxout
, blockSumErrMax
);
338 block1
[i
] = av_lfg_get(&prng
) % 512 -256;
348 block1
[0] = av_lfg_get(&prng
) % 512 -256;
349 block1
[1] = av_lfg_get(&prng
) % 512 -256;
350 block1
[2] = av_lfg_get(&prng
) % 512 -256;
351 block1
[3] = av_lfg_get(&prng
) % 512 -256;
355 if (form
== MMX_PERM
) {
357 block
[idct_mmx_perm
[i
]] = block1
[i
];
358 } else if(form
== MMX_SIMPLE_PERM
) {
360 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
369 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
372 // memcpy(block, block1, sizeof(DCTELEM) * 64);
373 // do not memcpy especially not fastmemcpy because it does movntq !!!
377 ti1
= gettime() - ti
;
378 } while (ti1
< 1000000);
381 printf("%s %s: %0.1f kdct/s\n",
382 is_idct
? "IDCT" : "DCT",
383 name
, (double)it1
* 1000.0 / (double)ti1
);
/*
 * 8x8 output images compared by idct248_error(), both used with a line
 * size of 8: img_dest is written by the idct248_put routine under test,
 * img_dest1 by idct248_ref(). Both are 8-byte aligned.
 */
static uint8_t img_dest[64]  __attribute__ ((aligned (8)));
static uint8_t img_dest1[64] __attribute__ ((aligned (8)));
390 static void idct248_ref(uint8_t *dest
, int linesize
, int16_t *block
)
393 static double c8
[8][8];
394 static double c4
[4][4];
395 double block1
[64], block2
[64], block3
[64];
405 s
= (i
==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
406 c8
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 8.0);
407 sum
+= c8
[i
][j
] * c8
[i
][j
];
414 s
= (i
==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
415 c4
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 4.0);
416 sum
+= c4
[i
][j
] * c4
[i
][j
];
425 block1
[8*(2*i
)+j
] = (block
[8*(2*i
)+j
] + block
[8*(2*i
+1)+j
]) * s
;
426 block1
[8*(2*i
+1)+j
] = (block
[8*(2*i
)+j
] - block
[8*(2*i
+1)+j
]) * s
;
435 sum
+= c8
[k
][j
] * block1
[8*i
+k
];
446 sum
+= c4
[k
][j
] * block2
[8*(2*k
)+i
];
447 block3
[8*(2*j
)+i
] = sum
;
452 sum
+= c4
[k
][j
] * block2
[8*(2*k
+1)+i
];
453 block3
[8*(2*j
+1)+i
] = sum
;
457 /* clamp and store the result */
465 dest
[i
* linesize
+ j
] = (int)rint(v
);
470 static void idct248_error(const char *name
,
471 void (*idct248_put
)(uint8_t *dest
, int line_size
, int16_t *block
))
473 int it
, i
, it1
, ti
, ti1
, err_max
, v
;
477 av_lfg_init(&prng
, 1);
479 /* just one test to see if code is correct (precision is less
482 for(it
=0;it
<NB_ITS
;it
++) {
484 /* XXX: use forward transform to generate values */
486 block1
[i
] = av_lfg_get(&prng
) % 256 - 128;
491 idct248_ref(img_dest1
, 8, block
);
495 idct248_put(img_dest
, 8, block
);
498 v
= abs((int)img_dest
[i
] - (int)img_dest1
[i
]);
500 printf("%d %d\n", img_dest
[i
], img_dest1
[i
]);
509 printf(" %3d", img_dest1
[i
*8+j
]);
518 printf(" %3d", img_dest
[i
*8+j
]);
524 printf("%s %s: err_inf=%d\n",
525 1 ? "IDCT248" : "DCT248",
531 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
534 // memcpy(block, block1, sizeof(DCTELEM) * 64);
535 // do not memcpy especially not fastmemcpy because it does movntq !!!
536 idct248_put(img_dest
, 8, block
);
539 ti1
= gettime() - ti
;
540 } while (ti1
< 1000000);
543 printf("%s %s: %0.1f kdct/s\n",
544 1 ? "IDCT248" : "DCT248",
545 name
, (double)it1
* 1000.0 / (double)ti1
);
548 static void help(void)
550 printf("dct-test [-i] [<test-number>]\n"
551 "test-number 0 -> test with random matrixes\n"
552 " 1 -> test with random sparse matrixes\n"
553 " 2 -> do 3. test from mpeg4 std\n"
554 "-i test IDCT implementations\n"
555 "-4 test IDCT248 implementations\n");
558 int main(int argc
, char **argv
)
560 int test_idct
= 0, test_248_dct
= 0;
563 cpu_flags
= mm_support();
568 for(i
=0;i
<256;i
++) cropTbl
[i
+ MAX_NEG_CROP
] = i
;
569 for(i
=0;i
<MAX_NEG_CROP
;i
++) {
571 cropTbl
[i
+ MAX_NEG_CROP
+ 256] = 255;
575 c
= getopt(argc
, argv
, "ih4");
592 if(optind
<argc
) test
= atoi(argv
[optind
]);
594 printf("ffmpeg DCT/IDCT test\n");
597 idct248_error("SIMPLE-C", ff_simple_idct248_put
);
599 for (i
=0;algos
[i
].name
;i
++)
600 if (algos
[i
].is_idct
== test_idct
&& !(~cpu_flags
& algos
[i
].mm_support
)) {
601 dct_error (algos
[i
].name
, algos
[i
].is_idct
, algos
[i
].func
, algos
[i
].ref
, algos
[i
].format
, test
);