2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test. (c) 2001 Fabrice Bellard.
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/common.h"
37 #include "simple_idct.h"
40 #include "i386/idct_xvid.h"
/*
 * Thin wrapper that forwards to the standard memcpy().
 *
 * a: destination buffer, at least c bytes long
 * b: source buffer, at least c bytes long; must not overlap a
 * c: number of bytes to copy
 *
 * Returns a, exactly as memcpy() does.
 */
void *fast_memcpy(void *a, const void *b, size_t c)
{
    return memcpy(a, b, c);
}
47 /* reference fdct/idct */
48 extern void fdct(DCTELEM
*block
);
49 extern void idct(DCTELEM
*block
);
50 extern void init_fdct();
52 extern void ff_mmx_idct(DCTELEM
*data
);
53 extern void ff_mmxext_idct(DCTELEM
*data
);
55 extern void odivx_idct_c (short *block
);
58 extern void ff_bfin_idct (DCTELEM
*block
) ;
59 extern void ff_bfin_fdct (DCTELEM
*block
) ;
62 extern void fdct_altivec (DCTELEM
*block
);
63 //extern void idct_altivec (DCTELEM *block);?? no routine
/* Kind of transform this entry implements. main() runs an entry only when
 * this equals its test_idct selector; dct_error() prints "IDCT" for a
 * nonzero value and "DCT" otherwise. */
68 enum { FDCT
, IDCT
} is_idct
;
/* Implementation under test; main() passes it to dct_error() as fdct_func. */
69 void (* func
) (DCTELEM
*block
);
/* Reference implementation; main() passes it to dct_error() as fdct_ref. */
70 void (* ref
) (DCTELEM
*block
);
/* Data layout tag; dct_error() receives it as "form" and permutes the input
 * (MMX_PERM, MMX_SIMPLE_PERM, SSE2_PERM) or rescales the output (SCALE_PERM)
 * accordingly. */
71 enum formattag
{ NO_PERM
,MMX_PERM
, MMX_SIMPLE_PERM
, SCALE_PERM
, SSE2_PERM
} format
;
/* Format tag used by the "FAAN" entry of the algorithm table.
 * NOTE(review): only the #ifndef and the two #define lines are visible here;
 * presumably an #else / #endif pair selects between them -- confirm. */
75 #ifndef FAAN_POSTSCALE
76 #define FAAN_SCALE SCALE_PERM
78 #define FAAN_SCALE NO_PERM
/* Table of the DCT/IDCT implementations that main() can exercise.
 * Each entry lists, in order: the printed name, the is_idct flag (0 for a
 * forward transform, 1 for an inverse one), the function under test, the
 * reference function it is compared against, the format tag, and for some
 * entries a CPU feature flag (FF_MM_*). main() skips an entry whose CPU flag
 * is not set in cpu_flags, and stops at the first entry with a null name.
 * NOTE(review): that terminating entry and any conditional-compilation
 * guards around the platform-specific entries are not visible here. */
83 struct algo algos
[] = {
84 {"REF-DBL", 0, fdct
, fdct
, NO_PERM
},
85 {"FAAN", 0, ff_faandct
, fdct
, FAAN_SCALE
},
86 {"FAANI", 1, ff_faanidct
, idct
, NO_PERM
},
87 {"IJG-AAN-INT", 0, fdct_ifast
, fdct
, SCALE_PERM
},
88 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow
, fdct
, NO_PERM
},
89 {"REF-DBL", 1, idct
, idct
, NO_PERM
},
90 {"INT", 1, j_rev_dct
, idct
, MMX_PERM
},
91 {"SIMPLE-C", 1, ff_simple_idct
, idct
, NO_PERM
},
94 {"MMX", 0, ff_fdct_mmx
, fdct
, NO_PERM
, FF_MM_MMX
},
96 {"MMX2", 0, ff_fdct_mmx2
, fdct
, NO_PERM
, FF_MM_MMXEXT
},
97 {"SSE2", 0, ff_fdct_sse2
, fdct
, NO_PERM
, FF_MM_SSE2
},
101 {"LIBMPEG2-MMX", 1, ff_mmx_idct
, idct
, MMX_PERM
, FF_MM_MMX
},
102 {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct
, idct
, MMX_PERM
, FF_MM_MMXEXT
},
104 {"SIMPLE-MMX", 1, ff_simple_idct_mmx
, idct
, MMX_SIMPLE_PERM
, FF_MM_MMX
},
105 {"XVID-MMX", 1, ff_idct_xvid_mmx
, idct
, NO_PERM
, FF_MM_MMX
},
106 {"XVID-MMX2", 1, ff_idct_xvid_mmx2
, idct
, NO_PERM
, FF_MM_MMXEXT
},
107 {"XVID-SSE2", 1, ff_idct_xvid_sse2
, idct
, SSE2_PERM
, FF_MM_SSE2
},
111 {"altivecfdct", 0, fdct_altivec
, fdct
, NO_PERM
, FF_MM_ALTIVEC
},
115 {"BFINfdct", 0, ff_bfin_fdct
, fdct
, NO_PERM
},
116 {"BFINidct", 1, ff_bfin_idct
, idct
, NO_PERM
},
/* Number of low bits dct_error() shifts away after rescaling the output of
 * a SCALE_PERM implementation. */
122 #define AANSCALE_BITS 12
/* Per-coefficient scale factors for the 64 positions of a block.
 * dct_error() divides 8*(1 << (AANSCALE_BITS + 11)) by the entry for each
 * position to obtain the multiplier applied to a SCALE_PERM output. */
123 static const unsigned short aanscales
[64] = {
124 /* precomputed values scaled up by 14 bits */
125 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
126 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
127 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
128 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
129 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
130 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
131 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
132 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
/* Lookup table with MAX_NEG_CROP guard entries on each side of a 256-entry
 * centre. main() fills the centre with the identity (0..255) and the upper
 * guard region with 255. */
135 uint8_t cropTbl
[256 + 2 * MAX_NEG_CROP
];
/* Return the time reported by gettimeofday() as a single 64-bit count of
 * microseconds (tv_sec * 1000000 + tv_usec). The cast to int64_t is applied
 * before the multiplication so the product is computed in 64 bits. */
137 int64_t gettime(void)
140 gettimeofday(&tv
,NULL
);
141 return (int64_t)tv
.tv_sec
* 1000000 + tv
.tv_usec
;
/* Number of calls made per timing batch in the speed loops of dct_error()
 * and idct248_error(). */
145 #define NB_ITS_SPEED 50000
/* Index permutation filled in by idct_mmx_init(); dct_error() uses it to
 * scatter the input when the format is MMX_PERM. */
147 static short idct_mmx_perm
[64];
/* Fixed index permutation; dct_error() uses it to scatter the input when
 * the format is MMX_SIMPLE_PERM. */
149 static short idct_simple_mmx_perm
[64]={
150 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
151 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
152 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
153 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
154 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
155 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
156 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
157 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Permutation of the low three index bits (i & 7); dct_error() combines it
 * with the unchanged bits (i & 0x38) when the format is SSE2_PERM. */
160 static const uint8_t idct_sse2_row_perm
[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Fill idct_mmx_perm[] with the input ordering used for MMX_PERM entries.
 * For every index i the bits in 0x38 are kept, bits 1-2 move down by one
 * position and bit 0 moves up to bit 2. */
162 void idct_mmx_init(void)
166 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
167 for (i
= 0; i
< 64; i
++) {
168 idct_mmx_perm
[i
] = (i
& 0x38) | ((i
& 6) >> 1) | ((i
& 1) << 2);
169 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
/* Working buffers shared by the test routines.
 * block:     receives the (possibly permuted) copy of block1 in dct_error()
 *            and is later compared element-wise against block1; requested
 *            alignment is 16 bytes.
 * block1:    holds the generated input coefficients; 8-byte alignment.
 * block_org: plain copy of block1 taken in dct_error(); 8-byte alignment. */
173 static DCTELEM block
[64] __attribute__ ((aligned (16)));
174 static DCTELEM block1
[64] __attribute__ ((aligned (8)));
175 static DCTELEM block_org
[64] __attribute__ ((aligned (8)));
/* Issue the "emms" instruction, but only when cpu_flags reports FF_MM_MMX. */
177 static inline void mmx_emms(void)
180 if (cpu_flags
& FF_MM_MMX
)
181 __asm__
volatile ("emms\n\t");
/* Compare one DCT/IDCT implementation against a reference and time it.
 *
 * name:      label printed in the result lines
 * is_idct:   nonzero prints "IDCT" in the result lines, zero prints "DCT"
 * fdct_func: implementation under test
 * fdct_ref:  reference implementation
 * form:      format tag (NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
 *            SSE2_PERM) controlling input permutation / output rescaling
 * test:      input pattern selector
 *            NOTE(review): the dispatch on "test" and the calls to fdct_func
 *            and fdct_ref are not visible in this excerpt -- confirm.
 *
 * Prints an accuracy summary followed by a throughput figure in kdct/s. */
185 void dct_error(const char *name
, int is_idct
,
186 void (*fdct_func
)(DCTELEM
*block
),
187 void (*fdct_ref
)(DCTELEM
*block
), int form
, int test
)
191 int64_t err2
, ti
, ti1
, it1
;
192 int64_t sysErr
[64], sysErrMax
=0;
194 int blockSumErrMax
=0, blockSumErr
;
/* clear the per-position systematic error accumulators */
200 for(i
=0; i
<64; i
++) sysErr
[i
]=0;
201 for(it
=0;it
<NB_ITS
;it
++) {
/* random coefficient in [-256, 255] */
207 block1
[i
] = (random() % 512) -256;
/* num lies in 1..10 */
216 int num
= (random()%10)+1;
/* random value in [-256, 255] written to a random position */
218 block1
[random()%64] = (random() % 512) -256;
/* position 0 gets a value in [-2048, 2047]; position 63 is set to the
 * inverse of the low bit of position 0 */
221 block1
[0]= (random()%4096)-2048;
222 block1
[63]= (block1
[0]&1)^1;
226 #if 0 // simulate mismatch control
231 if((sum
&1)==0) block1
[63]^=1;
/* keep an untouched copy of the generated input */
236 block_org
[i
]= block1
[i
];
/* copy the input into block, reordered according to form */
238 if (form
== MMX_PERM
) {
240 block
[idct_mmx_perm
[i
]] = block1
[i
];
241 } else if (form
== MMX_SIMPLE_PERM
) {
243 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
245 } else if (form
== SSE2_PERM
) {
247 block
[(i
&0x38) | idct_sse2_row_perm
[i
&7]] = block1
[i
];
252 #if 0 // simulate mismatch control for tested IDCT but not the ref
257 if((sum
&1)==0) block
[63]^=1;
/* SCALE_PERM: multiply every coefficient by its position-dependent factor
 * and drop AANSCALE_BITS low bits (the rounding term is commented out) */
264 if (form
== SCALE_PERM
) {
265 for(i
=0; i
<64; i
++) {
266 scale
= 8*(1 << (AANSCALE_BITS
+ 11)) / aanscales
[i
];
267 block
[i
] = (block
[i
] * scale
/*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS
;
/* absolute difference between block and block1 at this position */
275 v
= abs(block
[i
] - block1
[i
]);
/* signed difference accumulated per position across iterations */
279 sysErr
[i
] += block
[i
] - block1
[i
];
/* track the largest output magnitude seen */
281 if( abs(block
[i
])>maxout
) maxout
=abs(block
[i
]);
283 if(blockSumErrMax
< blockSumErr
) blockSumErrMax
= blockSumErr
;
284 #if 0 // print different matrix pairs
288 if((i
&7)==0) printf("\n");
289 printf("%4d ", block_org
[i
]);
292 if((i
&7)==0) printf("\n");
293 printf("%4d ", block
[i
] - block1
[i
]);
/* largest absolute accumulated error over all 64 positions */
298 for(i
=0; i
<64; i
++) sysErrMax
= FFMAX(sysErrMax
, FFABS(sysErr
[i
]));
300 #if 1 // dump systematic errors
302 if(i
%8==0) printf("\n");
303 printf("%5d ", (int)sysErr
[i
]);
/* accuracy summary: err2 is averaged over NB_ITS * 64 values, sysErrMax
 * over NB_ITS */
308 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
309 is_idct
? "IDCT" : "DCT",
310 name
, err_inf
, (double)err2
/ NB_ITS
/ 64.0, (double)sysErrMax
/ NB_ITS
, maxout
, blockSumErrMax
);
/* input generation for the speed measurement */
318 block1
[i
] = (random() % 512) -256;
328 block1
[0] = (random() % 512) -256;
329 block1
[1] = (random() % 512) -256;
330 block1
[2] = (random() % 512) -256;
331 block1
[3] = (random() % 512) -256;
/* same input reordering as in the accuracy pass (no SSE2_PERM branch is
 * visible here) */
335 if (form
== MMX_PERM
) {
337 block
[idct_mmx_perm
[i
]] = block1
[i
];
338 } else if(form
== MMX_SIMPLE_PERM
) {
340 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
/* one timing batch of NB_ITS_SPEED iterations */
349 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
352 // memcpy(block, block1, sizeof(DCTELEM) * 64);
353 // do not memcpy especially not fastmemcpy because it does movntq !!!
/* elapsed microseconds since ti; keep running batches until at least one
 * second (1000000 us) has passed */
357 ti1
= gettime() - ti
;
358 } while (ti1
< 1000000);
/* it1 * 1000 / elapsed-microseconds is reported as kdct/s */
361 printf("%s %s: %0.1f kdct/s\n",
362 is_idct
? "IDCT" : "DCT",
363 name
, (double)it1
* 1000.0 / (double)ti1
);
/* 64-byte output buffers used by idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 the output of
 * idct248_ref(). Both request 8-byte alignment. */
367 static uint8_t img_dest
[64] __attribute__ ((aligned (8)));
368 static uint8_t img_dest1
[64] __attribute__ ((aligned (8)));
/* Double-precision reference used by idct248_error().
 *
 * dest:     output bytes, written at dest[i * linesize + j]
 * linesize: distance in bytes between consecutive output rows
 * block:    64 input coefficients
 *
 * Visible steps: build the cosine tables c8 (8x8) and c4 (4x4), combine each
 * pair of adjacent input rows into a scaled sum and difference, apply c8
 * along each row, apply c4 separately to the even and the odd rows, then
 * round and store.
 * NOTE(review): the loop headers, the value of the scale s used for the
 * row-pair butterfly, and the clamping code are not visible here. */
370 void idct248_ref(uint8_t *dest
, int linesize
, int16_t *block
)
373 static double c8
[8][8];
374 static double c4
[4][4];
375 double block1
[64], block2
[64], block3
[64];
/* c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), s = sqrt(1/8) for i == 0 and
 * sqrt(1/4) otherwise; sum accumulates the squared entries */
385 s
= (i
==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
386 c8
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 8.0);
387 sum
+= c8
[i
][j
] * c8
[i
][j
];
/* c4[i][j] = s * cos(pi * i * (j + 0.5) / 4), s = sqrt(1/4) for i == 0 and
 * sqrt(1/2) otherwise */
394 s
= (i
==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
395 c4
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 4.0);
396 sum
+= c4
[i
][j
] * c4
[i
][j
];
/* rows 2i and 2i+1 become the scaled sum and difference of the input rows */
405 block1
[8*(2*i
)+j
] = (block
[8*(2*i
)+j
] + block
[8*(2*i
+1)+j
]) * s
;
406 block1
[8*(2*i
+1)+j
] = (block
[8*(2*i
)+j
] - block
[8*(2*i
+1)+j
]) * s
;
/* 8-point transform along row i */
415 sum
+= c8
[k
][j
] * block1
[8*i
+k
];
/* 4-point transform down column i over the even rows */
426 sum
+= c4
[k
][j
] * block2
[8*(2*k
)+i
];
427 block3
[8*(2*j
)+i
] = sum
;
/* 4-point transform down column i over the odd rows */
432 sum
+= c4
[k
][j
] * block2
[8*(2*k
+1)+i
];
433 block3
[8*(2*j
+1)+i
] = sum
;
437 /* clamp and store the result */
445 dest
[i
* linesize
+ j
] = (int)rint(v
);
/* Check an idct248 "put" implementation against idct248_ref() and time it.
 *
 * name:        label printed in the throughput line
 * idct248_put: implementation under test; called as
 *              idct248_put(img_dest, 8, block)
 *
 * The reference writes to img_dest1 and the tested function to img_dest,
 * both with a line size of 8; the outputs are compared byte by byte. The
 * speed loop then runs batches of NB_ITS_SPEED calls until at least one
 * second has elapsed and prints the rate in kdct/s.
 * NOTE(review): the comment opened at original line 457 has no visible
 * terminator in this excerpt; the remaining comment text and several
 * statements are not visible here. */
450 void idct248_error(const char *name
,
451 void (*idct248_put
)(uint8_t *dest
, int line_size
, int16_t *block
))
453 int it
, i
, it1
, ti
, ti1
, err_max
, v
;
457 /* just one test to see if code is correct (precision is less
460 for(it
=0;it
<NB_ITS
;it
++) {
462 /* XXX: use forward transform to generate values */
/* random coefficient in [-128, 127] */
464 block1
[i
] = (random() % 256) - 128;
469 idct248_ref(img_dest1
, 8, block
);
473 idct248_put(img_dest
, 8, block
);
/* absolute difference between tested and reference output byte */
476 v
= abs((int)img_dest
[i
] - (int)img_dest1
[i
]);
478 printf("%d %d\n", img_dest
[i
], img_dest1
[i
]);
487 printf(" %3d", img_dest1
[i
*8+j
]);
496 printf(" %3d", img_dest
[i
*8+j
]);
502 printf("%s %s: err_inf=%d\n",
503 1 ? "IDCT248" : "DCT248",
/* one timing batch of NB_ITS_SPEED calls */
509 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
512 // memcpy(block, block1, sizeof(DCTELEM) * 64);
513 // do not memcpy especially not fastmemcpy because it does movntq !!!
514 idct248_put(img_dest
, 8, block
);
/* elapsed microseconds since ti; repeat until at least one second passed */
517 ti1
= gettime() - ti
;
518 } while (ti1
< 1000000);
521 printf("%s %s: %0.1f kdct/s\n",
522 1 ? "IDCT248" : "DCT248",
523 name
, (double)it1
* 1000.0 / (double)ti1
);
/* Usage text: lists the optional test number (0, 1 or 2) and the -i and -4
 * switches. NOTE(review): the enclosing function header is not visible in
 * this excerpt. */
528 printf("dct-test [-i] [<test-number>]\n"
529 "test-number 0 -> test with random matrixes\n"
530 " 1 -> test with random sparse matrixes\n"
531 " 2 -> do 3. test from mpeg4 std\n"
532 "-i test IDCT implementations\n"
533 "-4 test IDCT248 implementations\n");
/* Program entry point: queries the CPU flags, initialises cropTbl, parses
 * the command line and runs the selected tests.
 *
 * Options accepted by getopt are "i", "h" and "4"; the first non-option
 * argument, if any, is converted with atoi() and used as the test number.
 * NOTE(review): the option switch, the condition guarding the idct248 test
 * and the return statement are not visible in this excerpt. */
536 int main(int argc
, char **argv
)
538 int test_idct
= 0, test_248_dct
= 0;
541 cpu_flags
= mm_support();
/* centre of cropTbl is the identity mapping 0..255 */
546 for(i
=0;i
<256;i
++) cropTbl
[i
+ MAX_NEG_CROP
] = i
;
547 for(i
=0;i
<MAX_NEG_CROP
;i
++) {
/* entries above the centre saturate to 255 */
549 cropTbl
[i
+ MAX_NEG_CROP
+ 256] = 255;
553 c
= getopt(argc
, argv
, "ih4");
/* atoi() reports no errors: a non-numeric argument yields test number 0 */
570 if(optind
<argc
) test
= atoi(argv
[optind
]);
572 printf("ffmpeg DCT/IDCT test\n");
575 idct248_error("SIMPLE-C", ff_simple_idct248_put
);
/* run every table entry whose transform kind matches test_idct and whose
 * required CPU flags are all present in cpu_flags */
577 for (i
=0;algos
[i
].name
;i
++)
578 if (algos
[i
].is_idct
== test_idct
&& !(~cpu_flags
& algos
[i
].mm_support
)) {
579 dct_error (algos
[i
].name
, algos
[i
].is_idct
, algos
[i
].func
, algos
[i
].ref
, algos
[i
].format
, test
);