Libav
|
00001 /* 00002 * (c) 2001 Fabrice Bellard 00003 * 2007 Marc Hoffman <marc.hoffman@analog.com> 00004 * 00005 * This file is part of FFmpeg. 00006 * 00007 * FFmpeg is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * FFmpeg is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with FFmpeg; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00028 #include <stdlib.h> 00029 #include <stdio.h> 00030 #include <string.h> 00031 #include <sys/time.h> 00032 #include <unistd.h> 00033 #include <math.h> 00034 00035 #include "libavutil/common.h" 00036 #include "libavutil/lfg.h" 00037 00038 #include "simple_idct.h" 00039 #include "aandcttab.h" 00040 #include "faandct.h" 00041 #include "faanidct.h" 00042 #include "x86/idct_xvid.h" 00043 #include "dctref.h" 00044 00045 #undef printf 00046 00047 void ff_mmx_idct(DCTELEM *data); 00048 void ff_mmxext_idct(DCTELEM *data); 00049 00050 void odivx_idct_c(short *block); 00051 00052 // BFIN 00053 void ff_bfin_idct(DCTELEM *block); 00054 void ff_bfin_fdct(DCTELEM *block); 00055 00056 // ALTIVEC 00057 void fdct_altivec(DCTELEM *block); 00058 //void idct_altivec(DCTELEM *block);?? no routine 00059 00060 // ARM 00061 void ff_j_rev_dct_arm(DCTELEM *data); 00062 void ff_simple_idct_arm(DCTELEM *data); 00063 void ff_simple_idct_armv5te(DCTELEM *data); 00064 void ff_simple_idct_armv6(DCTELEM *data); 00065 void ff_simple_idct_neon(DCTELEM *data); 00066 00067 void ff_simple_idct_axp(DCTELEM *data); 00068 00069 struct algo { 00070 const char *name; 00071 enum { FDCT, IDCT } is_idct; 00072 void (* func) (DCTELEM *block); 00073 void (* ref) (DCTELEM *block); 00074 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format; 00075 int mm_support; 00076 }; 00077 00078 #ifndef FAAN_POSTSCALE 00079 #define FAAN_SCALE SCALE_PERM 00080 #else 00081 #define FAAN_SCALE NO_PERM 00082 #endif 00083 00084 static int cpu_flags; 00085 00086 struct algo algos[] = { 00087 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM}, 00088 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE}, 00089 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM}, 00090 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM}, 00091 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM}, 00092 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM}, 00093 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM}, 00094 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM}, 00095 00096 #if HAVE_MMX 00097 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX}, 00098 #if HAVE_MMX2 00099 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMX2}, 00100 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2}, 00101 #endif 00102 00103 #if CONFIG_GPL 00104 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX}, 00105 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX2}, 00106 #endif 00107 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX}, 00108 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX}, 00109 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMX2}, 00110 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2}, 00111 #endif 00112 00113 #if HAVE_ALTIVEC 00114 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC}, 00115 #endif 00116 00117 #if ARCH_BFIN 00118 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM}, 00119 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM}, 00120 #endif 00121 00122 #if ARCH_ARM 00123 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM }, 00124 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM }, 00125 #if HAVE_ARMV5TE 00126 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM }, 00127 #endif 00128 #if HAVE_ARMV6 00129 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM }, 00130 #endif 00131 #if HAVE_NEON 00132 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM }, 00133 #endif 00134 #endif /* ARCH_ARM */ 00135 00136 #if ARCH_ALPHA 00137 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM }, 00138 #endif 00139 00140 { 0 } 00141 }; 00142 00143 #define AANSCALE_BITS 12 00144 00145 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; 00146 00147 static int64_t gettime(void) 00148 { 00149 struct timeval tv; 00150 gettimeofday(&tv,NULL); 00151 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 00152 } 00153 00154 #define NB_ITS 20000 00155 #define NB_ITS_SPEED 50000 00156 00157 static short idct_mmx_perm[64]; 00158 00159 static short idct_simple_mmx_perm[64]={ 00160 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 00161 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 00162 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 00163 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 00164 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 00165 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 00166 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 00167 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 00168 }; 00169 00170 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 00171 00172 static void idct_mmx_init(void) 00173 { 00174 int i; 00175 00176 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 00177 for (i = 0; i < 64; i++) { 00178 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 00179 // idct_simple_mmx_perm[i] = simple_block_permute_op(i); 00180 } 00181 } 00182 00183 DECLARE_ALIGNED(16, static DCTELEM, block)[64]; 00184 DECLARE_ALIGNED(8, static DCTELEM, block1)[64]; 00185 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64]; 00186 00187 static inline void mmx_emms(void) 00188 { 00189 #if HAVE_MMX 00190 if (cpu_flags & FF_MM_MMX) 00191 __asm__ volatile ("emms\n\t"); 00192 #endif 00193 } 00194 00195 static void dct_error(const char *name, int is_idct, 00196 void (*fdct_func)(DCTELEM *block), 00197 void (*fdct_ref)(DCTELEM *block), int form, int test) 00198 { 00199 int it, i, scale; 00200 int err_inf, v; 00201 int64_t err2, ti, ti1, it1; 00202 int64_t sysErr[64], sysErrMax=0; 00203 int maxout=0; 00204 int blockSumErrMax=0, blockSumErr; 00205 AVLFG prng; 00206 00207 av_lfg_init(&prng, 1); 00208 00209 err_inf = 0; 00210 err2 = 0; 00211 for(i=0; i<64; i++) sysErr[i]=0; 00212 for(it=0;it<NB_ITS;it++) { 00213 for(i=0;i<64;i++) 00214 block1[i] = 0; 00215 switch(test){ 00216 case 0: 00217 for(i=0;i<64;i++) 00218 block1[i] = (av_lfg_get(&prng) % 512) -256; 00219 if (is_idct){ 00220 ff_ref_fdct(block1); 00221 00222 for(i=0;i<64;i++) 00223 block1[i]>>=3; 00224 } 00225 break; 00226 case 1:{ 00227 int num = av_lfg_get(&prng) % 10 + 1; 00228 for(i=0;i<num;i++) 00229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256; 00230 }break; 00231 case 2: 00232 block1[0] = av_lfg_get(&prng) % 4096 - 2048; 00233 block1[63]= (block1[0]&1)^1; 00234 break; 00235 } 00236 00237 #if 0 // simulate mismatch control 00238 { int sum=0; 00239 for(i=0;i<64;i++) 00240 sum+=block1[i]; 00241 00242 if((sum&1)==0) block1[63]^=1; 00243 } 00244 #endif 00245 00246 for(i=0; i<64; i++) 00247 block_org[i]= block1[i]; 00248 00249 if (form == MMX_PERM) { 00250 for(i=0;i<64;i++) 00251 block[idct_mmx_perm[i]] = block1[i]; 00252 } else if (form == MMX_SIMPLE_PERM) { 00253 for(i=0;i<64;i++) 00254 block[idct_simple_mmx_perm[i]] = block1[i]; 00255 00256 } else if (form == SSE2_PERM) { 00257 for(i=0; i<64; i++) 00258 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i]; 00259 } else if (form == PARTTRANS_PERM) { 00260 for(i=0; i<64; i++) 00261 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i]; 00262 } else { 00263 for(i=0; i<64; i++) 00264 block[i]= block1[i]; 00265 } 00266 #if 0 // simulate mismatch control for tested IDCT but not the ref 00267 { int sum=0; 00268 for(i=0;i<64;i++) 00269 sum+=block[i]; 00270 00271 if((sum&1)==0) block[63]^=1; 00272 } 00273 #endif 00274 00275 fdct_func(block); 00276 mmx_emms(); 00277 00278 if (form == SCALE_PERM) { 00279 for(i=0; i<64; i++) { 00280 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 00281 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS; 00282 } 00283 } 00284 00285 fdct_ref(block1); 00286 00287 blockSumErr=0; 00288 for(i=0;i<64;i++) { 00289 v = abs(block[i] - block1[i]); 00290 if (v > err_inf) 00291 err_inf = v; 00292 err2 += v * v; 00293 sysErr[i] += block[i] - block1[i]; 00294 blockSumErr += v; 00295 if( abs(block[i])>maxout) maxout=abs(block[i]); 00296 } 00297 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr; 00298 #if 0 // print different matrix pairs 00299 if(blockSumErr){ 00300 printf("\n"); 00301 for(i=0; i<64; i++){ 00302 if((i&7)==0) printf("\n"); 00303 printf("%4d ", block_org[i]); 00304 } 00305 for(i=0; i<64; i++){ 00306 if((i&7)==0) printf("\n"); 00307 printf("%4d ", block[i] - block1[i]); 00308 } 00309 } 00310 #endif 00311 } 00312 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i])); 00313 00314 #if 1 // dump systematic errors 00315 for(i=0; i<64; i++){ 00316 if(i%8==0) printf("\n"); 00317 printf("%7d ", (int)sysErr[i]); 00318 } 00319 printf("\n"); 00320 #endif 00321 00322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 00323 is_idct ? "IDCT" : "DCT", 00324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax); 00325 #if 1 //Speed test 00326 /* speed test */ 00327 for(i=0;i<64;i++) 00328 block1[i] = 0; 00329 switch(test){ 00330 case 0: 00331 for(i=0;i<64;i++) 00332 block1[i] = av_lfg_get(&prng) % 512 -256; 00333 if (is_idct){ 00334 ff_ref_fdct(block1); 00335 00336 for(i=0;i<64;i++) 00337 block1[i]>>=3; 00338 } 00339 break; 00340 case 1:{ 00341 case 2: 00342 block1[0] = av_lfg_get(&prng) % 512 -256; 00343 block1[1] = av_lfg_get(&prng) % 512 -256; 00344 block1[2] = av_lfg_get(&prng) % 512 -256; 00345 block1[3] = av_lfg_get(&prng) % 512 -256; 00346 }break; 00347 } 00348 00349 if (form == MMX_PERM) { 00350 for(i=0;i<64;i++) 00351 block[idct_mmx_perm[i]] = block1[i]; 00352 } else if(form == MMX_SIMPLE_PERM) { 00353 for(i=0;i<64;i++) 00354 block[idct_simple_mmx_perm[i]] = block1[i]; 00355 } else { 00356 for(i=0; i<64; i++) 00357 block[i]= block1[i]; 00358 } 00359 00360 ti = gettime(); 00361 it1 = 0; 00362 do { 00363 for(it=0;it<NB_ITS_SPEED;it++) { 00364 for(i=0; i<64; i++) 00365 block[i]= block1[i]; 00366 // memcpy(block, block1, sizeof(DCTELEM) * 64); 00367 // do not memcpy especially not fastmemcpy because it does movntq !!! 00368 fdct_func(block); 00369 } 00370 it1 += NB_ITS_SPEED; 00371 ti1 = gettime() - ti; 00372 } while (ti1 < 1000000); 00373 mmx_emms(); 00374 00375 printf("%s %s: %0.1f kdct/s\n", 00376 is_idct ? "IDCT" : "DCT", 00377 name, (double)it1 * 1000.0 / (double)ti1); 00378 #endif 00379 } 00380 00381 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64]; 00382 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; 00383 00384 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 00385 { 00386 static int init; 00387 static double c8[8][8]; 00388 static double c4[4][4]; 00389 double block1[64], block2[64], block3[64]; 00390 double s, sum, v; 00391 int i, j, k; 00392 00393 if (!init) { 00394 init = 1; 00395 00396 for(i=0;i<8;i++) { 00397 sum = 0; 00398 for(j=0;j<8;j++) { 00399 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0); 00400 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 00401 sum += c8[i][j] * c8[i][j]; 00402 } 00403 } 00404 00405 for(i=0;i<4;i++) { 00406 sum = 0; 00407 for(j=0;j<4;j++) { 00408 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0); 00409 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 00410 sum += c4[i][j] * c4[i][j]; 00411 } 00412 } 00413 } 00414 00415 /* butterfly */ 00416 s = 0.5 * sqrt(2.0); 00417 for(i=0;i<4;i++) { 00418 for(j=0;j<8;j++) { 00419 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s; 00420 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s; 00421 } 00422 } 00423 00424 /* idct8 on lines */ 00425 for(i=0;i<8;i++) { 00426 for(j=0;j<8;j++) { 00427 sum = 0; 00428 for(k=0;k<8;k++) 00429 sum += c8[k][j] * block1[8*i+k]; 00430 block2[8*i+j] = sum; 00431 } 00432 } 00433 00434 /* idct4 */ 00435 for(i=0;i<8;i++) { 00436 for(j=0;j<4;j++) { 00437 /* top */ 00438 sum = 0; 00439 for(k=0;k<4;k++) 00440 sum += c4[k][j] * block2[8*(2*k)+i]; 00441 block3[8*(2*j)+i] = sum; 00442 00443 /* bottom */ 00444 sum = 0; 00445 for(k=0;k<4;k++) 00446 sum += c4[k][j] * block2[8*(2*k+1)+i]; 00447 block3[8*(2*j+1)+i] = sum; 00448 } 00449 } 00450 00451 /* clamp and store the result */ 00452 for(i=0;i<8;i++) { 00453 for(j=0;j<8;j++) { 00454 v = block3[8*i+j]; 00455 if (v < 0) 00456 v = 0; 00457 else if (v > 255) 00458 v = 255; 00459 dest[i * linesize + j] = (int)rint(v); 00460 } 00461 } 00462 } 00463 00464 static void idct248_error(const char *name, 00465 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block)) 00466 { 00467 int it, i, it1, ti, ti1, err_max, v; 00468 00469 AVLFG prng; 00470 00471 av_lfg_init(&prng, 1); 00472 00473 /* just one test to see if code is correct (precision is less 00474 important here) */ 00475 err_max = 0; 00476 for(it=0;it<NB_ITS;it++) { 00477 00478 /* XXX: use forward transform to generate values */ 00479 for(i=0;i<64;i++) 00480 block1[i] = av_lfg_get(&prng) % 256 - 128; 00481 block1[0] += 1024; 00482 00483 for(i=0; i<64; i++) 00484 block[i]= block1[i]; 00485 idct248_ref(img_dest1, 8, block); 00486 00487 for(i=0; i<64; i++) 00488 block[i]= block1[i]; 00489 idct248_put(img_dest, 8, block); 00490 00491 for(i=0;i<64;i++) { 00492 v = abs((int)img_dest[i] - (int)img_dest1[i]); 00493 if (v == 255) 00494 printf("%d %d\n", img_dest[i], img_dest1[i]); 00495 if (v > err_max) 00496 err_max = v; 00497 } 00498 #if 0 00499 printf("ref=\n"); 00500 for(i=0;i<8;i++) { 00501 int j; 00502 for(j=0;j<8;j++) { 00503 printf(" %3d", img_dest1[i*8+j]); 00504 } 00505 printf("\n"); 00506 } 00507 00508 printf("out=\n"); 00509 for(i=0;i<8;i++) { 00510 int j; 00511 for(j=0;j<8;j++) { 00512 printf(" %3d", img_dest[i*8+j]); 00513 } 00514 printf("\n"); 00515 } 00516 #endif 00517 } 00518 printf("%s %s: err_inf=%d\n", 00519 1 ? "IDCT248" : "DCT248", 00520 name, err_max); 00521 00522 ti = gettime(); 00523 it1 = 0; 00524 do { 00525 for(it=0;it<NB_ITS_SPEED;it++) { 00526 for(i=0; i<64; i++) 00527 block[i]= block1[i]; 00528 // memcpy(block, block1, sizeof(DCTELEM) * 64); 00529 // do not memcpy especially not fastmemcpy because it does movntq !!! 00530 idct248_put(img_dest, 8, block); 00531 } 00532 it1 += NB_ITS_SPEED; 00533 ti1 = gettime() - ti; 00534 } while (ti1 < 1000000); 00535 mmx_emms(); 00536 00537 printf("%s %s: %0.1f kdct/s\n", 00538 1 ? "IDCT248" : "DCT248", 00539 name, (double)it1 * 1000.0 / (double)ti1); 00540 } 00541 00542 static void help(void) 00543 { 00544 printf("dct-test [-i] [<test-number>]\n" 00545 "test-number 0 -> test with random matrixes\n" 00546 " 1 -> test with random sparse matrixes\n" 00547 " 2 -> do 3. test from mpeg4 std\n" 00548 "-i test IDCT implementations\n" 00549 "-4 test IDCT248 implementations\n"); 00550 } 00551 00552 int main(int argc, char **argv) 00553 { 00554 int test_idct = 0, test_248_dct = 0; 00555 int c,i; 00556 int test=1; 00557 cpu_flags = mm_support(); 00558 00559 ff_ref_dct_init(); 00560 idct_mmx_init(); 00561 00562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; 00563 for(i=0;i<MAX_NEG_CROP;i++) { 00564 cropTbl[i] = 0; 00565 cropTbl[i + MAX_NEG_CROP + 256] = 255; 00566 } 00567 00568 for(;;) { 00569 c = getopt(argc, argv, "ih4"); 00570 if (c == -1) 00571 break; 00572 switch(c) { 00573 case 'i': 00574 test_idct = 1; 00575 break; 00576 case '4': 00577 test_248_dct = 1; 00578 break; 00579 default : 00580 case 'h': 00581 help(); 00582 return 0; 00583 } 00584 } 00585 00586 if(optind <argc) test= atoi(argv[optind]); 00587 00588 printf("ffmpeg DCT/IDCT test\n"); 00589 00590 if (test_248_dct) { 00591 idct248_error("SIMPLE-C", ff_simple_idct248_put); 00592 } else { 00593 for (i=0;algos[i].name;i++) 00594 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) { 00595 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test); 00596 } 00597 } 00598 return 0; 00599 }