00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
/* x86 IDCTs registered in idct_tab as "LIBMPEG2-MMX" / "LIBMPEG2-MMX2"
 * (only when HAVE_MMX and CONFIG_GPL are set). */
void ff_mmx_idct(DCTELEM *data);
void ff_mmxext_idct(DCTELEM *data);

/* NOTE(review): declared here but not referenced by any table in this file. */
void odivx_idct_c(short *block);


/* Blackfin transforms, registered in the tables when ARCH_BFIN is set. */
void ff_bfin_idct(DCTELEM *block);
void ff_bfin_fdct(DCTELEM *block);


/* AltiVec forward DCT, registered in fdct_tab when HAVE_ALTIVEC is set. */
void fdct_altivec(DCTELEM *block);



/* ARM IDCT variants, registered in idct_tab under ARCH_ARM, HAVE_ARMV5TE,
 * HAVE_ARMV6 and HAVE_NEON respectively. */
void ff_j_rev_dct_arm(DCTELEM *data);
void ff_simple_idct_arm(DCTELEM *data);
void ff_simple_idct_armv5te(DCTELEM *data);
void ff_simple_idct_armv6(DCTELEM *data);
void ff_simple_idct_neon(DCTELEM *data);

/* Alpha IDCT, registered in idct_tab when ARCH_ALPHA is set. */
void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 void (*func)(DCTELEM *block);
00073 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00074 SSE2_PERM, PARTTRANS_PERM } format;
00075 int mm_support;
00076 int nonspec;
00077 };
00078
/* Format tag for the "FAAN" entry of fdct_tab: NO_PERM when
 * FAAN_POSTSCALE is defined, SCALE_PERM (output rescaled by dct_error())
 * otherwise. */
#ifdef FAAN_POSTSCALE
#define FAAN_SCALE NO_PERM
#else
#define FAAN_SCALE SCALE_PERM
#endif

/* CPU capability flags; assigned once in main() from av_get_cpu_flags(). */
static int cpu_flags;
00086
00087 static const struct algo fdct_tab[] = {
00088 { "REF-DBL", ff_ref_fdct, NO_PERM },
00089 { "FAAN", ff_faandct, FAAN_SCALE },
00090 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
00091 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
00092
00093 #if HAVE_MMX
00094 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
00095 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
00096 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
00097 #endif
00098
00099 #if HAVE_ALTIVEC
00100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
00101 #endif
00102
00103 #if ARCH_BFIN
00104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
00105 #endif
00106
00107 { 0 }
00108 };
00109
00110 static const struct algo idct_tab[] = {
00111 { "FAANI", ff_faanidct, NO_PERM },
00112 { "REF-DBL", ff_ref_idct, NO_PERM },
00113 { "INT", j_rev_dct, MMX_PERM },
00114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
00115
00116 #if HAVE_MMX
00117 #if CONFIG_GPL
00118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
00119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
00120 #endif
00121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
00123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
00124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00125 #endif
00126
00127 #if ARCH_BFIN
00128 { "BFINidct", ff_bfin_idct, NO_PERM },
00129 #endif
00130
00131 #if ARCH_ARM
00132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
00133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
00134 #endif
00135 #if HAVE_ARMV5TE
00136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
00137 #endif
00138 #if HAVE_ARMV6
00139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
00140 #endif
00141 #if HAVE_NEON
00142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
00143 #endif
00144
00145 #if ARCH_ALPHA
00146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
00147 #endif
00148
00149 { 0 }
00150 };
00151
/* Shift used by dct_error() when rescaling SCALE_PERM output. */
#define AANSCALE_BITS 12

/**
 * Read the current time with gettimeofday().
 *
 * @return seconds * 1000000 + microseconds, as a 64-bit value
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);
    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}

/* Number of blocks processed by each accuracy test. */
#define NB_ITS 20000
/* Number of transforms per timing batch in the speed tests. */
#define NB_ITS_SPEED 50000
00163
/* Destination index per source coefficient for MMX_PERM; filled in by
 * idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index per source coefficient for MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column per source column (low 3 index bits) for SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm[]: bits 3-5 of the index are kept, and the low
 * three bits are rotated right by one position.
 */
static void idct_mmx_init(void)
{
    int idx = 0;

    while (idx < 64) {
        const int row_bits = idx & 0x38;
        const int upper    = (idx & 6) >> 1;
        const int lowest   = (idx & 1) << 2;

        idct_mmx_perm[idx] = row_bits | upper | lowest;
        idx++;
    }
}
00188
/* Shared 64-coefficient work buffers. In dct_error()'s accuracy loop,
 * block receives the permuted input and the output of the transform
 * under test, while block1 holds the generated input and then the
 * reference result. The first macro argument is the requested alignment. */
DECLARE_ALIGNED(16, static DCTELEM, block)[64];
DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00191
/**
 * Execute the x86 "emms" instruction when built with HAVE_MMX and the
 * running CPU reports AV_CPU_FLAG_MMX; otherwise do nothing.
 */
static inline void mmx_emms(void)
{
#if HAVE_MMX
    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;
    __asm__ volatile ("emms\n\t");
#endif
}
00199
00200 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
00201 {
00202 int i, j;
00203
00204 memset(block, 0, 64 * sizeof(*block));
00205
00206 switch (test) {
00207 case 0:
00208 for (i = 0; i < 64; i++)
00209 block[i] = (av_lfg_get(prng) % 512) - 256;
00210 if (is_idct) {
00211 ff_ref_fdct(block);
00212 for (i = 0; i < 64; i++)
00213 block[i] >>= 3;
00214 }
00215 break;
00216 case 1:
00217 j = av_lfg_get(prng) % 10 + 1;
00218 for (i = 0; i < j; i++)
00219 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
00220 break;
00221 case 2:
00222 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
00223 block[63] = (block[0] & 1) ^ 1;
00224 break;
00225 }
00226 }
00227
00228 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00229 {
00230 int i;
00231
00232 if (perm == MMX_PERM) {
00233 for (i = 0; i < 64; i++)
00234 dst[idct_mmx_perm[i]] = src[i];
00235 } else if (perm == MMX_SIMPLE_PERM) {
00236 for (i = 0; i < 64; i++)
00237 dst[idct_simple_mmx_perm[i]] = src[i];
00238 } else if (perm == SSE2_PERM) {
00239 for (i = 0; i < 64; i++)
00240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00241 } else if (perm == PARTTRANS_PERM) {
00242 for (i = 0; i < 64; i++)
00243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00244 } else {
00245 for (i = 0; i < 64; i++)
00246 dst[i] = src[i];
00247 }
00248 }
00249
00250 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
00251 {
00252 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00253 int it, i, scale;
00254 int err_inf, v;
00255 int64_t err2, ti, ti1, it1, err_sum = 0;
00256 int64_t sysErr[64], sysErrMax = 0;
00257 int maxout = 0;
00258 int blockSumErrMax = 0, blockSumErr;
00259 AVLFG prng;
00260 double omse, ome;
00261 int spec_err;
00262
00263 av_lfg_init(&prng, 1);
00264
00265 err_inf = 0;
00266 err2 = 0;
00267 for (i = 0; i < 64; i++)
00268 sysErr[i] = 0;
00269 for (it = 0; it < NB_ITS; it++) {
00270 init_block(block1, test, is_idct, &prng);
00271 permute(block, block1, dct->format);
00272
00273 dct->func(block);
00274 mmx_emms();
00275
00276 if (dct->format == SCALE_PERM) {
00277 for (i = 0; i < 64; i++) {
00278 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00279 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00280 }
00281 }
00282
00283 ref(block1);
00284
00285 blockSumErr = 0;
00286 for (i = 0; i < 64; i++) {
00287 int err = block[i] - block1[i];
00288 err_sum += err;
00289 v = abs(err);
00290 if (v > err_inf)
00291 err_inf = v;
00292 err2 += v * v;
00293 sysErr[i] += block[i] - block1[i];
00294 blockSumErr += v;
00295 if (abs(block[i]) > maxout)
00296 maxout = abs(block[i]);
00297 }
00298 if (blockSumErrMax < blockSumErr)
00299 blockSumErrMax = blockSumErr;
00300 }
00301 for (i = 0; i < 64; i++)
00302 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00303
00304 for (i = 0; i < 64; i++) {
00305 if (i % 8 == 0)
00306 printf("\n");
00307 printf("%7d ", (int) sysErr[i]);
00308 }
00309 printf("\n");
00310
00311 omse = (double) err2 / NB_ITS / 64;
00312 ome = (double) err_sum / NB_ITS / 64;
00313
00314 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00315
00316 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00317 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00318 omse, ome, (double) sysErrMax / NB_ITS,
00319 maxout, blockSumErrMax);
00320
00321 if (spec_err && !dct->nonspec)
00322 return 1;
00323
00324 if (!speed)
00325 return 0;
00326
00327
00328 init_block(block, test, is_idct, &prng);
00329 permute(block1, block, dct->format);
00330
00331 ti = gettime();
00332 it1 = 0;
00333 do {
00334 for (it = 0; it < NB_ITS_SPEED; it++) {
00335 memcpy(block, block1, sizeof(block));
00336 dct->func(block);
00337 }
00338 it1 += NB_ITS_SPEED;
00339 ti1 = gettime() - ti;
00340 } while (ti1 < 1000000);
00341 mmx_emms();
00342
00343 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00344 (double) it1 * 1000.0 / (double) ti1);
00345
00346 return 0;
00347 }
00348
/* 8x8 pixel outputs compared by idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 that of
 * idct248_ref(). */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00351
00352 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00353 {
00354 static int init;
00355 static double c8[8][8];
00356 static double c4[4][4];
00357 double block1[64], block2[64], block3[64];
00358 double s, sum, v;
00359 int i, j, k;
00360
00361 if (!init) {
00362 init = 1;
00363
00364 for (i = 0; i < 8; i++) {
00365 sum = 0;
00366 for (j = 0; j < 8; j++) {
00367 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00368 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00369 sum += c8[i][j] * c8[i][j];
00370 }
00371 }
00372
00373 for (i = 0; i < 4; i++) {
00374 sum = 0;
00375 for (j = 0; j < 4; j++) {
00376 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00377 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00378 sum += c4[i][j] * c4[i][j];
00379 }
00380 }
00381 }
00382
00383
00384 s = 0.5 * sqrt(2.0);
00385 for (i = 0; i < 4; i++) {
00386 for (j = 0; j < 8; j++) {
00387 block1[8 * (2 * i) + j] =
00388 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00389 block1[8 * (2 * i + 1) + j] =
00390 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00391 }
00392 }
00393
00394
00395 for (i = 0; i < 8; i++) {
00396 for (j = 0; j < 8; j++) {
00397 sum = 0;
00398 for (k = 0; k < 8; k++)
00399 sum += c8[k][j] * block1[8 * i + k];
00400 block2[8 * i + j] = sum;
00401 }
00402 }
00403
00404
00405 for (i = 0; i < 8; i++) {
00406 for (j = 0; j < 4; j++) {
00407
00408 sum = 0;
00409 for (k = 0; k < 4; k++)
00410 sum += c4[k][j] * block2[8 * (2 * k) + i];
00411 block3[8 * (2 * j) + i] = sum;
00412
00413
00414 sum = 0;
00415 for (k = 0; k < 4; k++)
00416 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00417 block3[8 * (2 * j + 1) + i] = sum;
00418 }
00419 }
00420
00421
00422 for (i = 0; i < 8; i++) {
00423 for (j = 0; j < 8; j++) {
00424 v = block3[8 * i + j];
00425 if (v < 0) v = 0;
00426 else if (v > 255) v = 255;
00427 dest[i * linesize + j] = (int) rint(v);
00428 }
00429 }
00430 }
00431
00432 static void idct248_error(const char *name,
00433 void (*idct248_put)(uint8_t *dest, int line_size,
00434 int16_t *block),
00435 int speed)
00436 {
00437 int it, i, it1, ti, ti1, err_max, v;
00438 AVLFG prng;
00439
00440 av_lfg_init(&prng, 1);
00441
00442
00443
00444 err_max = 0;
00445 for (it = 0; it < NB_ITS; it++) {
00446
00447 for (i = 0; i < 64; i++)
00448 block1[i] = av_lfg_get(&prng) % 256 - 128;
00449 block1[0] += 1024;
00450
00451 for (i = 0; i < 64; i++)
00452 block[i] = block1[i];
00453 idct248_ref(img_dest1, 8, block);
00454
00455 for (i = 0; i < 64; i++)
00456 block[i] = block1[i];
00457 idct248_put(img_dest, 8, block);
00458
00459 for (i = 0; i < 64; i++) {
00460 v = abs((int) img_dest[i] - (int) img_dest1[i]);
00461 if (v == 255)
00462 printf("%d %d\n", img_dest[i], img_dest1[i]);
00463 if (v > err_max)
00464 err_max = v;
00465 }
00466 }
00467 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00468
00469 if (!speed)
00470 return;
00471
00472 ti = gettime();
00473 it1 = 0;
00474 do {
00475 for (it = 0; it < NB_ITS_SPEED; it++) {
00476 for (i = 0; i < 64; i++)
00477 block[i] = block1[i];
00478 idct248_put(img_dest, 8, block);
00479 }
00480 it1 += NB_ITS_SPEED;
00481 ti1 = gettime() - ti;
00482 } while (ti1 < 1000000);
00483 mmx_emms();
00484
00485 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00486 (double) it1 * 1000.0 / (double) ti1);
00487 }
00488
/**
 * Print the command line usage summary to standard output.
 */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        "            1 -> test with random sparse matrixes\n"
        "            2 -> do 3. test from mpeg4 std\n"
        "-i          test IDCT implementations\n"
        "-4          test IDCT248 implementations\n"
        "-t          speed test\n";

    printf("%s", usage);
}
00499
00500 int main(int argc, char **argv)
00501 {
00502 int test_idct = 0, test_248_dct = 0;
00503 int c, i;
00504 int test = 1;
00505 int speed = 0;
00506 int err = 0;
00507
00508 cpu_flags = av_get_cpu_flags();
00509
00510 ff_ref_dct_init();
00511 idct_mmx_init();
00512
00513 for (;;) {
00514 c = getopt(argc, argv, "ih4t");
00515 if (c == -1)
00516 break;
00517 switch (c) {
00518 case 'i':
00519 test_idct = 1;
00520 break;
00521 case '4':
00522 test_248_dct = 1;
00523 break;
00524 case 't':
00525 speed = 1;
00526 break;
00527 default:
00528 case 'h':
00529 help();
00530 return 0;
00531 }
00532 }
00533
00534 if (optind < argc)
00535 test = atoi(argv[optind]);
00536
00537 printf("Libav DCT/IDCT test\n");
00538
00539 if (test_248_dct) {
00540 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00541 } else {
00542 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00543 for (i = 0; algos[i].name; i++)
00544 if (!(~cpu_flags & algos[i].mm_support)) {
00545 err |= dct_error(&algos[i], test, test_idct, speed);
00546 }
00547 }
00548
00549 return err;
00550 }