2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Hand-written prototypes for architecture-specific DCT/IDCT routines.
 * No definition of these functions is visible in this part of the file;
 * most of them are referenced from the algos table. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* NOTE(review): odivx_idct_c is not referenced by any visible line of this
 * file - confirm whether the declaration is still needed. */
51 void odivx_idct_c(short *block);
/* Referenced by the BFINfdct and BFINidct table entries. */
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
/* Referenced by the altivecfdct table entry. */
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
/* Referenced by the ARM table entries (SIMPLE-ARM, INT-ARM, SIMPLE-ARMV5TE,
 * SIMPLE-ARMV6 and SIMPLE-NEON). */
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
/* Referenced by the SIMPLE-ALPHA table entry. */
68 void ff_simple_idct_axp(DCTELEM *data);
/* Members of one algos table entry. The opening of the struct definition is
 * not visible here; other code in this file also reads a name member
 * (printed by dct_error) and an mm_support member (tested in main). */
/* Transform direction. The table initialises it with 0 (FDCT) or 1 (IDCT). */
72 enum { FDCT, IDCT } is_idct;
/* Routine under test (third column of the table). */
73 void (*func)(DCTELEM *block);
/* Reference routine (fourth column of the table; every visible entry uses
 * ff_ref_fdct or ff_ref_idct). */
74 void (*ref) (DCTELEM *block);
/* Coefficient layout or scaling associated with the routine. dct_error
 * permutes the input block (the *_PERM values) or rescales the block
 * (SCALE_PERM) according to this tag. */
75 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
76 SSE2_PERM, PARTTRANS_PERM } format;
/* Format tag used by the FAAN entry of the algos table: SCALE_PERM when
 * FAAN_POSTSCALE is not defined.
 * NOTE(review): the two definitions appear back to back in this view; the
 * second one presumably belongs to an alternative branch of the conditional
 * whose lines are not visible - confirm against the full file. */
80 #ifndef FAAN_POSTSCALE
81 #define FAAN_SCALE SCALE_PERM
83 #define FAAN_SCALE NO_PERM
/* Table of transforms to test. Columns, in order: display name, direction
 * (0 = forward DCT, 1 = inverse DCT), routine under test, reference routine,
 * format tag and, for some entries, a CPU flag.
 * main walks the table until it reaches an entry whose name is null, and runs
 * an entry only when its direction matches the requested one and every bit
 * of its CPU flag value is present in the detected CPU flags.
 * NOTE(review): the terminating entry and any preprocessor conditionals that
 * select the architecture-specific groups are not visible here. */
88 struct algo algos[] = {
/* Portable C implementations. REF-DBL appears twice, once per direction. */
89 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
90 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
91 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
92 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
93 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
94 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
95 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
96 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
/* Entries that require an MMX, MMX2 or SSE2 CPU flag. */
99 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
101 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
102 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
106 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
107 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
109 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
110 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
111 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
112 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
/* Entry that requires the AltiVec CPU flag. */
116 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
/* The remaining entries carry no CPU flag value in the table. */
120 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
121 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
125 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
126 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
128 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
131 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
134 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
136 #endif /* ARCH_ARM */
139 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
/* Number of fractional bits used by dct_error when it rescales the block for
 * SCALE_PERM routines. */
145 #define AANSCALE_BITS 12
/* Clamping table. main fills entries MAX_NEG_CROP .. MAX_NEG_CROP + 255 with
 * the values 0 .. 255 and the top MAX_NEG_CROP entries with 255.
 * NOTE(review): the table has external linkage, presumably because routines
 * in other objects read it - confirm. */
147 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Return the current time in microseconds, combining the seconds and
 * microseconds fields filled in by gettimeofday(). The seconds field is
 * widened to int64_t before the multiplication. The return value of
 * gettimeofday() itself is not checked. */
149 static int64_t gettime(void)
152 gettimeofday(&tv, NULL);
153 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of transform calls in one batch of the timing loops. */
157 #define NB_ITS_SPEED 50000
/* Input permutation applied for MMX_PERM routines; filled at run time by
 * idct_mmx_init(). */
159 static short idct_mmx_perm[64];
/* Input permutation applied for MMX_SIMPLE_PERM routines: coefficient i of
 * the source block is stored at index idct_simple_mmx_perm[i]. */
161 static short idct_simple_mmx_perm[64] = {
162 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
163 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
164 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
165 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
166 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
167 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
168 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
169 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column reordering used for SSE2_PERM routines: the low three bits of the
 * coefficient index are replaced by idct_sse2_row_perm[index & 7]. */
172 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm. For every index the row bits (mask 0x38) are kept and
 * the three column bits are rotated: bits 1-2 move down to bits 0-1 and
 * bit 0 moves up to bit 2. */
174 static void idct_mmx_init(void)
178 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
179 for (i = 0; i < 64; i++) {
180 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Working buffers shared by dct_error and idct248_error, each holding 64
 * coefficients and declared through DECLARE_ALIGNED with a first argument
 * of 16 or 8 (presumably the requested alignment in bytes - confirm against
 * the macro definition).
 * block     - copy of the input, permuted as required, that is compared
 *             against block1 afterwards.
 * block1    - the generated input; the error statistics compare block with
 *             it.
 * block_org - plain copy of the generated input made by dct_error. */
184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/* Execute the emms instruction when the CPU flags stored in cpu_flags
 * include AV_CPU_FLAG_MMX.
 * NOTE(review): any preprocessor guard that restricts this inline assembly
 * to x86 builds is not visible in this view - confirm. */
188 static inline void mmx_emms(void)
191 if (cpu_flags & AV_CPU_FLAG_MMX)
192 __asm__ volatile ("emms\n\t");
/* Measure first the accuracy and then the speed of one algos table entry
 * and print both results.
 *
 * dct  - table entry to test.
 * test - selects how the input blocks are generated. help() lists
 *        0 = random matrices, 1 = random sparse matrices and
 *        2 = third test from the MPEG-4 standard. The statement that
 *        dispatches on this value is not visible in this view.
 *
 * NOTE(review): several statements of this function (declarations, the
 * calls to the tested and reference routines, closing braces) are missing
 * from this view; the comments below describe only the visible lines. */
196 static void dct_error(const struct algo *dct, int test)
200 int64_t err2, ti, ti1, it1;
201 int64_t sysErr[64], sysErrMax = 0;
203 int blockSumErrMax = 0, blockSumErr;
/* Fixed seed, so every run uses the same pseudo-random sequence. */
206 av_lfg_init(&prng, 1);
210 for (i = 0; i < 64; i++)
/* Accuracy pass: NB_ITS generated blocks. */
212 for (it = 0; it < NB_ITS; it++) {
213 for (i = 0; i < 64; i++)
/* Dense input: every coefficient is av_lfg_get() % 512 - 256. */
217 for (i = 0; i < 64; i++)
218 block1[i] = (av_lfg_get(&prng) % 512) - 256;
221 for (i = 0; i < 64; i++)
/* Sparse input: between 1 and 10 randomly chosen positions are set. */
226 int num = av_lfg_get(&prng) % 10 + 1;
227 for (i = 0; i < num; i++)
228 block1[av_lfg_get(&prng) % 64] =
229 av_lfg_get(&prng) % 512 - 256;
/* Random first coefficient (av_lfg_get() % 4096 - 2048); the last
 * coefficient is set to the inverse of its lowest bit. */
233 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
234 block1[63] = (block1[0] & 1) ^ 1;
/* Keep an untouched copy of the generated input. */
238 for (i = 0; i < 64; i++)
239 block_org[i] = block1[i];
/* Copy the input into block using the layout named by the format tag. */
241 if (dct->format == MMX_PERM) {
242 for (i = 0; i < 64; i++)
243 block[idct_mmx_perm[i]] = block1[i];
244 } else if (dct->format == MMX_SIMPLE_PERM) {
245 for (i = 0; i < 64; i++)
246 block[idct_simple_mmx_perm[i]] = block1[i];
247 } else if (dct->format == SSE2_PERM) {
/* Same row, column taken from idct_sse2_row_perm. */
248 for (i = 0; i < 64; i++)
249 block[(i & 0x38) | idct_sse2_row_perm[i & 7]] = block1[i];
250 } else if (dct->format == PARTTRANS_PERM) {
/* Keep index bits 2 and 5, swap bits 0-1 with bits 3-4. */
251 for (i = 0; i < 64; i++)
252 block[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = block1[i];
/* Straight copy. */
254 for (i = 0; i < 64; i++)
255 block[i] = block1[i];
/* For SCALE_PERM routines rescale every coefficient of block:
 * scale is 2^(AANSCALE_BITS + 14) / ff_aanscales[i], and the product is
 * shifted right by AANSCALE_BITS. */
261 if (dct->format == SCALE_PERM) {
262 for (i = 0; i < 64; i++) {
263 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
264 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient statistics: v is the absolute difference between block
 * and block1, sysErr accumulates the signed difference, and maxout tracks
 * the largest magnitude seen in block. */
271 for (i = 0; i < 64; i++) {
272 v = abs(block[i] - block1[i]);
276 sysErr[i] += block[i] - block1[i];
278 if (abs(block[i]) > maxout)
279 maxout = abs(block[i]);
/* Remember the largest per-block error sum. */
281 if (blockSumErrMax < blockSumErr)
282 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated signed error over the 64 positions. */
284 for (i = 0; i < 64; i++)
285 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Print the accumulated signed error of every position. */
287 for (i = 0; i < 64; i++) {
290 printf("%7d ", (int) sysErr[i]);
/* Summary line; err2 is averaged over NB_ITS blocks of 64 coefficients and
 * sysErrMax over NB_ITS blocks. */
294 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
295 dct->is_idct ? "IDCT" : "DCT", dct->name, err_inf,
296 (double) err2 / NB_ITS / 64.0, (double) sysErrMax / NB_ITS,
297 maxout, blockSumErrMax);
/* Speed pass: generate one input block. */
300 for (i = 0; i < 64; i++)
305 for (i = 0; i < 64; i++)
306 block1[i] = av_lfg_get(&prng) % 512 - 256;
309 for (i = 0; i < 64; i++)
315 block1[0] = av_lfg_get(&prng) % 512 - 256;
316 block1[1] = av_lfg_get(&prng) % 512 - 256;
317 block1[2] = av_lfg_get(&prng) % 512 - 256;
318 block1[3] = av_lfg_get(&prng) % 512 - 256;
/* NOTE(review): only MMX_PERM and MMX_SIMPLE_PERM are handled here, unlike
 * the accuracy pass, which also handles SSE2_PERM and PARTTRANS_PERM. */
322 if (dct->format == MMX_PERM) {
323 for (i = 0; i < 64; i++)
324 block[idct_mmx_perm[i]] = block1[i];
325 } else if (dct->format == MMX_SIMPLE_PERM) {
326 for (i = 0; i < 64; i++)
327 block[idct_simple_mmx_perm[i]] = block1[i];
329 for (i = 0; i < 64; i++)
330 block[i] = block1[i];
/* Timed batches of NB_ITS_SPEED iterations, repeated until at least
 * 1000000 microseconds have elapsed.
 * NOTE(review): the plain copy inside the loop replaces the permuted input
 * prepared just above, so the timed calls appear to run on unpermuted
 * data - presumably acceptable for timing, confirm. */
336 for (it = 0; it < NB_ITS_SPEED; it++) {
337 for (i = 0; i < 64; i++)
338 block[i] = block1[i];
342 ti1 = gettime() - ti;
343 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 with ti1 in microseconds gives thousands of transforms
 * per second. */
346 printf("%s %s: %0.1f kdct/s\n", dct->is_idct ? "IDCT" : "DCT", dct->name,
347 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte pixel buffers for the IDCT248 test: idct248_error passes img_dest
 * to the routine under test and img_dest1 to idct248_ref, then compares
 * them byte by byte. */
350 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
351 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference used by idct248_error.
 *
 * dest     - output pixels; element (i, j) is written at i * linesize + j.
 * linesize - distance in bytes between two output rows.
 * block    - 64 input coefficients, read as 8 rows of 8.
 *
 * NOTE(review): several statements (declarations, initialisations of sum
 * and s, any run-once guard around the table setup, closing braces) are
 * missing from this view; the comments describe only the visible lines. */
353 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables with static storage, so they persist between calls. */
356 static double c8[8][8];
357 static double c4[4][4];
/* NOTE(review): this local block1 shadows the file-scope block1 buffer. */
358 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. sum accumulates the
 * squared entries. */
365 for (i = 0; i < 8; i++) {
367 for (j = 0; j < 8; j++) {
368 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
369 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
370 sum += c8[i][j] * c8[i][j];
/* 4-point table: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4), with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise. */
374 for (i = 0; i < 4; i++) {
376 for (j = 0; j < 4; j++) {
377 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
378 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
379 sum += c4[i][j] * c4[i][j];
/* For each pair of adjacent input rows (2i, 2i + 1) store the scaled sum in
 * row 2i and the scaled difference in row 2i + 1 of block1. */
386 for (i = 0; i < 4; i++) {
387 for (j = 0; j < 8; j++) {
388 block1[8 * (2 * i) + j] =
389 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
390 block1[8 * (2 * i + 1) + j] =
391 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along every row: block2[i][j] is the sum over k of
 * c8[k][j] * block1[i][k]. */
396 for (i = 0; i < 8; i++) {
397 for (j = 0; j < 8; j++) {
399 for (k = 0; k < 8; k++)
400 sum += c8[k][j] * block1[8 * i + k];
401 block2[8 * i + j] = sum;
/* 4-point transform along every column i, applied separately to the even
 * rows (results go to even rows of block3) and to the odd rows (results go
 * to odd rows of block3). */
406 for (i = 0; i < 8; i++) {
407 for (j = 0; j < 4; j++) {
410 for (k = 0; k < 4; k++)
411 sum += c4[k][j] * block2[8 * (2 * k) + i];
412 block3[8 * (2 * j) + i] = sum;
416 for (k = 0; k < 4; k++)
417 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
418 block3[8 * (2 * j + 1) + i] = sum;
422 /* clamp and store the result */
423 for (i = 0; i < 8; i++) {
424 for (j = 0; j < 8; j++) {
425 v = block3[8 * i + j];
/* Upper clamp to 255; the value is rounded with rint() before storing. */
427 else if (v > 255) v = 255;
428 dest[i * linesize + j] = (int) rint(v);
/* Compare an IDCT248 put routine against idct248_ref on NB_ITS generated
 * blocks, print the largest pixel difference, then time the routine.
 *
 * name        - label printed in both result lines.
 * idct248_put - routine under test; called with img_dest, a line size of 8
 *               and the coefficient block.
 *
 * NOTE(review): ti, ti1 and it1 are declared int below although gettime()
 * returns int64_t and dct_error keeps the same quantities in int64_t. The
 * assignments from gettime() therefore narrow a 64-bit microsecond count;
 * consider int64_t here as well.
 * NOTE(review): several statements of this function are missing from this
 * view; the comments describe only the visible lines. */
433 static void idct248_error(const char *name,
434 void (*idct248_put)(uint8_t *dest, int line_size,
437 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed, so every run uses the same pseudo-random sequence. */
440 av_lfg_init(&prng, 1);
442 /* just one test to see if code is correct (precision is less
445 for (it = 0; it < NB_ITS; it++) {
446 /* XXX: use forward transform to generate values */
447 for (i = 0; i < 64; i++)
448 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Run the reference on a fresh copy of the input. */
451 for (i = 0; i < 64; i++)
452 block[i] = block1[i];
453 idct248_ref(img_dest1, 8, block);
/* Run the routine under test on another fresh copy. */
455 for (i = 0; i < 64; i++)
456 block[i] = block1[i];
457 idct248_put(img_dest, 8, block);
/* Absolute pixel difference between the two outputs. */
459 for (i = 0; i < 64; i++) {
460 v = abs((int) img_dest[i] - (int) img_dest1[i]);
462 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* NOTE(review): the condition is the constant 1, so the label is always
 * IDCT248 and the other string is never used. */
467 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timed batches of NB_ITS_SPEED calls, repeated until at least 1000000
 * microseconds have elapsed. */
472 for (it = 0; it < NB_ITS_SPEED; it++) {
473 for (i = 0; i < 64; i++)
474 block[i] = block1[i];
475 idct248_put(img_dest, 8, block);
478 ti1 = gettime() - ti;
479 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 with ti1 in microseconds gives thousands of transforms
 * per second. */
482 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
483 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the synopsis line lists only -i, while the text below also
 * documents -4 and the option string passed to getopt in main is ih4;
 * consider adding -4 and -h to the synopsis. */
486 static void help(void)
488 printf("dct-test [-i] [<test-number>]\n"
489 "test-number 0 -> test with random matrixes\n"
490 " 1 -> test with random sparse matrixes\n"
491 " 2 -> do 3. test from mpeg4 std\n"
492 "-i test IDCT implementations\n"
493 "-4 test IDCT248 implementations\n");
/* Program entry point: detect CPU flags, fill cropTbl, parse the command
 * line and run the selected tests.
 * NOTE(review): this definition continues past the last visible line and
 * several statements inside it are missing from this view; the comments
 * describe only the visible lines. */
496 int main(int argc, char **argv)
498 int test_idct = 0, test_248_dct = 0;
/* Detected CPU flags; read later by mmx_emms and by the table filter. */
502 cpu_flags = av_get_cpu_flags();
/* Identity part of the clamping table. */
507 for (i = 0; i < 256; i++)
508 cropTbl[i + MAX_NEG_CROP] = i;
/* Entries above the identity part saturate to 255. */
509 for (i = 0; i < MAX_NEG_CROP; i++) {
511 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted option letters: i, h and 4. */
515 c = getopt(argc, argv, "ih4");
/* NOTE(review): atoi reports no errors, so a non-numeric argument cannot be
 * told apart from 0; strtol with an end-pointer check would allow
 * validation. */
533 test = atoi(argv[optind]);
/* NOTE(review): the banner says ffmpeg although the file header says this
 * file is part of Libav - confirm which name is intended. */
535 printf("ffmpeg DCT/IDCT test\n");
538 idct248_error("SIMPLE-C", ff_simple_idct248_put);
/* Walk the table up to the entry with a null name. An entry is tested when
 * its direction matches test_idct and no bit of its mm_support value is
 * missing from cpu_flags. */
540 for (i = 0; algos[i].name; i++)
541 if (algos[i].is_idct == test_idct &&
542 !(~cpu_flags & algos[i].mm_support)) {
543 dct_error(&algos[i], test);