Libav 0.7.1
|
00001 /* 00002 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) 00003 * 00004 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org> 00005 * 00006 * This file is part of Libav. 00007 * 00008 * Libav is free software; you can redistribute it and/or modify 00009 * it under the terms of the GNU General Public License as published by 00010 * the Free Software Foundation; either version 2 of the License, or 00011 * (at your option) any later version. 00012 * 00013 * Libav is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 * GNU General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU General Public License 00019 * along with Libav; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00028 /* 00029 C MMX MMX2 3DNow AltiVec 00030 isVertDC Ec Ec Ec 00031 isVertMinMaxOk Ec Ec Ec 00032 doVertLowPass E e e Ec 00033 doVertDefFilter Ec Ec e e Ec 00034 isHorizDC Ec Ec Ec 00035 isHorizMinMaxOk a E Ec 00036 doHorizLowPass E e e Ec 00037 doHorizDefFilter Ec Ec e e Ec 00038 do_a_deblock Ec E Ec E 00039 deRing E e e* Ecp 00040 Vertical RKAlgo1 E a a 00041 Horizontal RKAlgo1 a a 00042 Vertical X1# a E E 00043 Horizontal X1# a E E 00044 LinIpolDeinterlace e E E* 00045 CubicIpolDeinterlace a e e* 00046 LinBlendDeinterlace e E E* 00047 MedianDeinterlace# E Ec Ec 00048 TempDeNoiser# E e e Ec 00049 00050 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work 00051 # more or less selfinvented filters so the exactness is not too meaningful 00052 E = Exact implementation 00053 e = almost exact implementation (slightly different rounding,...) 00054 a = alternative / approximate impl 00055 c = checked against the other implementations (-vo md5) 00056 p = partially optimized, still some work to do 00057 */ 00058 00059 /* 00060 TODO: 00061 reduce the time wasted on the mem transfer 00062 unroll stuff if instructions depend too much on the prior one 00063 move YScale thing to the end instead of fixing QP 00064 write a faster and higher quality deblocking filter :) 00065 make the mainloop more flexible (variable number of blocks at once 00066 (the if/else stuff per block is slowing things down) 00067 compare the quality & speed of all filters 00068 split this huge file 00069 optimize c versions 00070 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 00071 ... 00072 */ 00073 00074 //Changelog: use git log 00075 00076 #include "config.h" 00077 #include "libavutil/avutil.h" 00078 #include <inttypes.h> 00079 #include <stdio.h> 00080 #include <stdlib.h> 00081 #include <string.h> 00082 //#undef HAVE_MMX2 00083 //#define HAVE_AMD3DNOW 00084 //#undef HAVE_MMX 00085 //#undef ARCH_X86 00086 //#define DEBUG_BRIGHTNESS 00087 #include "postprocess.h" 00088 #include "postprocess_internal.h" 00089 #include "libavutil/avstring.h" 00090 00091 unsigned postproc_version(void) 00092 { 00093 return LIBPOSTPROC_VERSION_INT; 00094 } 00095 00096 const char *postproc_configuration(void) 00097 { 00098 return LIBAV_CONFIGURATION; 00099 } 00100 00101 const char *postproc_license(void) 00102 { 00103 #define LICENSE_PREFIX "libpostproc license: " 00104 return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1; 00105 } 00106 00107 #if HAVE_ALTIVEC_H 00108 #include <altivec.h> 00109 #endif 00110 00111 #define GET_MODE_BUFFER_SIZE 500 00112 #define OPTIONS_ARRAY_SIZE 10 00113 #define BLOCK_SIZE 8 00114 #define TEMP_STRIDE 8 00115 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet 00116 00117 #if ARCH_X86 00118 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL; 00119 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL; 00120 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL; 00121 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL; 00122 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL; 00123 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL; 00124 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL; 00125 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL; 00126 #endif 00127 00128 DECLARE_ASM_CONST(8, int, deringThreshold)= 20; 00129 00130 00131 static struct PPFilter filters[]= 00132 { 00133 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, 00134 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, 00135 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, 00136 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ 00137 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, 00138 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, 00139 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK}, 00140 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK}, 00141 {"dr", "dering", 1, 5, 6, DERING}, 00142 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, 00143 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, 00144 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, 00145 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, 00146 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, 00147 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, 00148 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER}, 00149 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, 00150 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, 00151 {NULL, NULL,0,0,0,0} //End Marker 00152 }; 00153 00154 static const char *replaceTable[]= 00155 { 00156 "default", "hb:a,vb:a,dr:a", 00157 "de", "hb:a,vb:a,dr:a", 00158 "fast", "h1:a,v1:a,dr:a", 00159 "fa", "h1:a,v1:a,dr:a", 00160 "ac", "ha:a:128:7,va:a,dr:a", 00161 NULL //End Marker 00162 }; 00163 00164 00165 #if ARCH_X86 00166 static inline void prefetchnta(void *p) 00167 { 00168 __asm__ volatile( "prefetchnta (%0)\n\t" 00169 : : "r" (p) 00170 ); 00171 } 00172 00173 static inline void prefetcht0(void *p) 00174 { 00175 __asm__ volatile( "prefetcht0 (%0)\n\t" 00176 : : "r" (p) 00177 ); 00178 } 00179 00180 static inline void prefetcht1(void *p) 00181 { 00182 __asm__ volatile( "prefetcht1 (%0)\n\t" 00183 : : "r" (p) 00184 ); 00185 } 00186 00187 static inline void prefetcht2(void *p) 00188 { 00189 __asm__ volatile( "prefetcht2 (%0)\n\t" 00190 : : "r" (p) 00191 ); 00192 } 00193 #endif 00194 00195 /* The horizontal functions exist only in C because the MMX 00196 * code is faster with vertical filters and transposing. */ 00197 00201 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c) 00202 { 00203 int numEq= 0; 00204 int y; 00205 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00206 const int dcThreshold= dcOffset*2 + 1; 00207 00208 for(y=0; y<BLOCK_SIZE; y++){ 00209 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; 00210 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; 00211 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; 00212 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; 00213 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; 00214 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; 00215 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; 00216 src+= stride; 00217 } 00218 return numEq > c->ppMode.flatnessThreshold; 00219 } 00220 00224 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c) 00225 { 00226 int numEq= 0; 00227 int y; 00228 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00229 const int dcThreshold= dcOffset*2 + 1; 00230 00231 src+= stride*4; // src points to begin of the 8x8 Block 00232 for(y=0; y<BLOCK_SIZE-1; y++){ 00233 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; 00234 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; 00235 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; 00236 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; 00237 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; 00238 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; 00239 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; 00240 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; 00241 src+= stride; 00242 } 00243 return numEq > c->ppMode.flatnessThreshold; 00244 } 00245 00246 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP) 00247 { 00248 int i; 00249 #if 1 00250 for(i=0; i<2; i++){ 00251 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; 00252 src += stride; 00253 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; 00254 src += stride; 00255 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; 00256 src += stride; 00257 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; 00258 src += stride; 00259 } 00260 #else 00261 for(i=0; i<8; i++){ 00262 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0; 00263 src += stride; 00264 } 00265 #endif 00266 return 1; 00267 } 00268 00269 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP) 00270 { 00271 #if 1 00272 #if 1 00273 int x; 00274 src+= stride*4; 00275 for(x=0; x<BLOCK_SIZE; x+=4){ 00276 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; 00277 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; 00278 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; 00279 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; 00280 } 00281 #else 00282 int x; 00283 src+= stride*3; 00284 for(x=0; x<BLOCK_SIZE; x++){ 00285 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; 00286 } 00287 #endif 00288 return 1; 00289 #else 00290 int x; 00291 src+= stride*4; 00292 for(x=0; x<BLOCK_SIZE; x++){ 00293 int min=255; 00294 int max=0; 00295 int y; 00296 for(y=0; y<8; y++){ 00297 int v= src[x + y*stride]; 00298 if(v>max) max=v; 00299 if(v<min) min=v; 00300 } 00301 if(max-min > 2*QP) return 0; 00302 } 00303 return 1; 00304 #endif 00305 } 00306 00307 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c) 00308 { 00309 if( isHorizDC_C(src, stride, c) ){ 00310 if( isHorizMinMaxOk_C(src, stride, c->QP) ) 00311 return 1; 00312 else 00313 return 0; 00314 }else{ 00315 return 2; 00316 } 00317 } 00318 00319 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c) 00320 { 00321 if( isVertDC_C(src, stride, c) ){ 00322 if( isVertMinMaxOk_C(src, stride, c->QP) ) 00323 return 1; 00324 else 00325 return 0; 00326 }else{ 00327 return 2; 00328 } 00329 } 00330 00331 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c) 00332 { 00333 int y; 00334 for(y=0; y<BLOCK_SIZE; y++){ 00335 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); 00336 00337 if(FFABS(middleEnergy) < 8*c->QP){ 00338 const int q=(dst[3] - dst[4])/2; 00339 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); 00340 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); 00341 00342 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00343 d= FFMAX(d, 0); 00344 00345 d= (5*d + 32) >> 6; 00346 d*= FFSIGN(-middleEnergy); 00347 00348 if(q>0) 00349 { 00350 d= d<0 ? 0 : d; 00351 d= d>q ? q : d; 00352 } 00353 else 00354 { 00355 d= d>0 ? 0 : d; 00356 d= d<q ? q : d; 00357 } 00358 00359 dst[3]-= d; 00360 dst[4]+= d; 00361 } 00362 dst+= stride; 00363 } 00364 } 00365 00370 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c) 00371 { 00372 int y; 00373 for(y=0; y<BLOCK_SIZE; y++){ 00374 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; 00375 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; 00376 00377 int sums[10]; 00378 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; 00379 sums[1] = sums[0] - first + dst[3]; 00380 sums[2] = sums[1] - first + dst[4]; 00381 sums[3] = sums[2] - first + dst[5]; 00382 sums[4] = sums[3] - first + dst[6]; 00383 sums[5] = sums[4] - dst[0] + dst[7]; 00384 sums[6] = sums[5] - dst[1] + last; 00385 sums[7] = sums[6] - dst[2] + last; 00386 sums[8] = sums[7] - dst[3] + last; 00387 sums[9] = sums[8] - dst[4] + last; 00388 00389 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; 00390 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; 00391 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; 00392 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; 00393 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; 00394 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; 00395 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; 00396 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; 00397 00398 dst+= stride; 00399 } 00400 } 00401 00410 static inline void horizX1Filter(uint8_t *src, int stride, int QP) 00411 { 00412 int y; 00413 static uint64_t *lut= NULL; 00414 if(lut==NULL) 00415 { 00416 int i; 00417 lut = av_malloc(256*8); 00418 for(i=0; i<256; i++) 00419 { 00420 int v= i < 128 ? 2*i : 2*(i-256); 00421 /* 00422 //Simulate 112242211 9-Tap filter 00423 uint64_t a= (v/16) & 0xFF; 00424 uint64_t b= (v/8) & 0xFF; 00425 uint64_t c= (v/4) & 0xFF; 00426 uint64_t d= (3*v/8) & 0xFF; 00427 */ 00428 //Simulate piecewise linear interpolation 00429 uint64_t a= (v/16) & 0xFF; 00430 uint64_t b= (v*3/16) & 0xFF; 00431 uint64_t c= (v*5/16) & 0xFF; 00432 uint64_t d= (7*v/16) & 0xFF; 00433 uint64_t A= (0x100 - a)&0xFF; 00434 uint64_t B= (0x100 - b)&0xFF; 00435 uint64_t C= (0x100 - c)&0xFF; 00436 uint64_t D= (0x100 - c)&0xFF; 00437 00438 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | 00439 (D<<24) | (C<<16) | (B<<8) | (A); 00440 //lut[i] = (v<<32) | (v<<24); 00441 } 00442 } 00443 00444 for(y=0; y<BLOCK_SIZE; y++){ 00445 int a= src[1] - src[2]; 00446 int b= src[3] - src[4]; 00447 int c= src[5] - src[6]; 00448 00449 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); 00450 00451 if(d < QP){ 00452 int v = d * FFSIGN(-b); 00453 00454 src[1] +=v/8; 00455 src[2] +=v/4; 00456 src[3] +=3*v/8; 00457 src[4] -=3*v/8; 00458 src[5] -=v/4; 00459 src[6] -=v/8; 00460 } 00461 src+=stride; 00462 } 00463 } 00464 00468 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){ 00469 int y; 00470 const int QP= c->QP; 00471 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00472 const int dcThreshold= dcOffset*2 + 1; 00473 //START_TIMER 00474 src+= step*4; // src points to begin of the 8x8 Block 00475 for(y=0; y<8; y++){ 00476 int numEq= 0; 00477 00478 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++; 00479 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++; 00480 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++; 00481 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++; 00482 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++; 00483 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++; 00484 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++; 00485 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++; 00486 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++; 00487 if(numEq > c->ppMode.flatnessThreshold){ 00488 int min, max, x; 00489 00490 if(src[0] > src[step]){ 00491 max= src[0]; 00492 min= src[step]; 00493 }else{ 00494 max= src[step]; 00495 min= src[0]; 00496 } 00497 for(x=2; x<8; x+=2){ 00498 if(src[x*step] > src[(x+1)*step]){ 00499 if(src[x *step] > max) max= src[ x *step]; 00500 if(src[(x+1)*step] < min) min= src[(x+1)*step]; 00501 }else{ 00502 if(src[(x+1)*step] > max) max= src[(x+1)*step]; 00503 if(src[ x *step] < min) min= src[ x *step]; 00504 } 00505 } 00506 if(max-min < 2*QP){ 00507 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; 00508 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; 00509 00510 int sums[10]; 00511 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; 00512 sums[1] = sums[0] - first + src[3*step]; 00513 sums[2] = sums[1] - first + src[4*step]; 00514 sums[3] = sums[2] - first + src[5*step]; 00515 sums[4] = sums[3] - first + src[6*step]; 00516 sums[5] = sums[4] - src[0*step] + src[7*step]; 00517 sums[6] = sums[5] - src[1*step] + last; 00518 sums[7] = sums[6] - src[2*step] + last; 00519 sums[8] = sums[7] - src[3*step] + last; 00520 sums[9] = sums[8] - src[4*step] + last; 00521 00522 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; 00523 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; 00524 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; 00525 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; 00526 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; 00527 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; 00528 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; 00529 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; 00530 } 00531 }else{ 00532 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); 00533 00534 if(FFABS(middleEnergy) < 8*QP){ 00535 const int q=(src[3*step] - src[4*step])/2; 00536 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); 00537 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); 00538 00539 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00540 d= FFMAX(d, 0); 00541 00542 d= (5*d + 32) >> 6; 00543 d*= FFSIGN(-middleEnergy); 00544 00545 if(q>0){ 00546 d= d<0 ? 0 : d; 00547 d= d>q ? q : d; 00548 }else{ 00549 d= d>0 ? 0 : d; 00550 d= d<q ? q : d; 00551 } 00552 00553 src[3*step]-= d; 00554 src[4*step]+= d; 00555 } 00556 } 00557 00558 src += stride; 00559 } 00560 /*if(step==16){ 00561 STOP_TIMER("step16") 00562 }else{ 00563 STOP_TIMER("stepX") 00564 }*/ 00565 } 00566 00567 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one 00568 //Plain C versions 00569 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT 00570 #define COMPILE_C 00571 #endif 00572 00573 #if HAVE_ALTIVEC 00574 #define COMPILE_ALTIVEC 00575 #endif //HAVE_ALTIVEC 00576 00577 #if ARCH_X86 00578 00579 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00580 #define COMPILE_MMX 00581 #endif 00582 00583 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT 00584 #define COMPILE_MMX2 00585 #endif 00586 00587 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00588 #define COMPILE_3DNOW 00589 #endif 00590 #endif /* ARCH_X86 */ 00591 00592 #undef HAVE_MMX 00593 #define HAVE_MMX 0 00594 #undef HAVE_MMX2 00595 #define HAVE_MMX2 0 00596 #undef HAVE_AMD3DNOW 00597 #define HAVE_AMD3DNOW 0 00598 #undef HAVE_ALTIVEC 00599 #define HAVE_ALTIVEC 0 00600 00601 #ifdef COMPILE_C 00602 #define RENAME(a) a ## _C 00603 #include "postprocess_template.c" 00604 #endif 00605 00606 #ifdef COMPILE_ALTIVEC 00607 #undef RENAME 00608 #undef HAVE_ALTIVEC 00609 #define HAVE_ALTIVEC 1 00610 #define RENAME(a) a ## _altivec 00611 #include "postprocess_altivec_template.c" 00612 #include "postprocess_template.c" 00613 #endif 00614 00615 //MMX versions 00616 #ifdef COMPILE_MMX 00617 #undef RENAME 00618 #undef HAVE_MMX 00619 #define HAVE_MMX 1 00620 #define RENAME(a) a ## _MMX 00621 #include "postprocess_template.c" 00622 #endif 00623 00624 //MMX2 versions 00625 #ifdef COMPILE_MMX2 00626 #undef RENAME 00627 #undef HAVE_MMX 00628 #undef HAVE_MMX2 00629 #define HAVE_MMX 1 00630 #define HAVE_MMX2 1 00631 #define RENAME(a) a ## _MMX2 00632 #include "postprocess_template.c" 00633 #endif 00634 00635 //3DNOW versions 00636 #ifdef COMPILE_3DNOW 00637 #undef RENAME 00638 #undef HAVE_MMX 00639 #undef HAVE_MMX2 00640 #undef HAVE_AMD3DNOW 00641 #define HAVE_MMX 1 00642 #define HAVE_MMX2 0 00643 #define HAVE_AMD3DNOW 1 00644 #define RENAME(a) a ## _3DNow 00645 #include "postprocess_template.c" 00646 #endif 00647 00648 // minor note: the HAVE_xyz is messed up after that line so do not use it. 00649 00650 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00651 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc) 00652 { 00653 PPContext *c= (PPContext *)vc; 00654 PPMode *ppMode= (PPMode *)vm; 00655 c->ppMode= *ppMode; //FIXME 00656 00657 // Using ifs here as they are faster than function pointers although the 00658 // difference would not be measurable here but it is much better because 00659 // someone might exchange the CPU whithout restarting MPlayer ;) 00660 #if CONFIG_RUNTIME_CPUDETECT 00661 #if ARCH_X86 00662 // ordered per speed fastest first 00663 if(c->cpuCaps & PP_CPU_CAPS_MMX2) 00664 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00665 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) 00666 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00667 else if(c->cpuCaps & PP_CPU_CAPS_MMX) 00668 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00669 else 00670 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00671 #else 00672 #if HAVE_ALTIVEC 00673 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC) 00674 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00675 else 00676 #endif 00677 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00678 #endif 00679 #else //CONFIG_RUNTIME_CPUDETECT 00680 #if HAVE_MMX2 00681 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00682 #elif HAVE_AMD3DNOW 00683 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00684 #elif HAVE_MMX 00685 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00686 #elif HAVE_ALTIVEC 00687 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00688 #else 00689 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00690 #endif 00691 #endif //!CONFIG_RUNTIME_CPUDETECT 00692 } 00693 00694 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00695 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 00696 00697 /* -pp Command line Help 00698 */ 00699 const char pp_help[] = 00700 "Available postprocessing filters:\n" 00701 "Filters Options\n" 00702 "short long name short long option Description\n" 00703 "* * a autoq CPU power dependent enabler\n" 00704 " c chrom chrominance filtering enabled\n" 00705 " y nochrom chrominance filtering disabled\n" 00706 " n noluma luma filtering disabled\n" 00707 "hb hdeblock (2 threshold) horizontal deblocking filter\n" 00708 " 1. difference factor: default=32, higher -> more deblocking\n" 00709 " 2. flatness threshold: default=39, lower -> more deblocking\n" 00710 " the h & v deblocking filters share these\n" 00711 " so you can't set different thresholds for h / v\n" 00712 "vb vdeblock (2 threshold) vertical deblocking filter\n" 00713 "ha hadeblock (2 threshold) horizontal deblocking filter\n" 00714 "va vadeblock (2 threshold) vertical deblocking filter\n" 00715 "h1 x1hdeblock experimental h deblock filter 1\n" 00716 "v1 x1vdeblock experimental v deblock filter 1\n" 00717 "dr dering deringing filter\n" 00718 "al autolevels automatic brightness / contrast\n" 00719 " f fullyrange stretch luminance to (0..255)\n" 00720 "lb linblenddeint linear blend deinterlacer\n" 00721 "li linipoldeint linear interpolating deinterlace\n" 00722 "ci cubicipoldeint cubic interpolating deinterlacer\n" 00723 "md mediandeint median deinterlacer\n" 00724 "fd ffmpegdeint ffmpeg deinterlacer\n" 00725 "l5 lowpass5 FIR lowpass deinterlacer\n" 00726 "de default hb:a,vb:a,dr:a\n" 00727 "fa fast h1:a,v1:a,dr:a\n" 00728 "ac ha:a:128:7,va:a,dr:a\n" 00729 "tn tmpnoise (3 threshold) temporal noise reducer\n" 00730 " 1. <= 2. <= 3. larger -> stronger filtering\n" 00731 "fq forceQuant <quantizer> force quantizer\n" 00732 "Usage:\n" 00733 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" 00734 "long form example:\n" 00735 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" 00736 "short form example:\n" 00737 "vb:a/hb:a/lb de,-vb\n" 00738 "more examples:\n" 00739 "tn:64:128:256\n" 00740 "\n" 00741 ; 00742 00743 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality) 00744 { 00745 char temp[GET_MODE_BUFFER_SIZE]; 00746 char *p= temp; 00747 static const char filterDelimiters[] = ",/"; 00748 static const char optionDelimiters[] = ":"; 00749 struct PPMode *ppMode; 00750 char *filterToken; 00751 00752 ppMode= av_malloc(sizeof(PPMode)); 00753 00754 ppMode->lumMode= 0; 00755 ppMode->chromMode= 0; 00756 ppMode->maxTmpNoise[0]= 700; 00757 ppMode->maxTmpNoise[1]= 1500; 00758 ppMode->maxTmpNoise[2]= 3000; 00759 ppMode->maxAllowedY= 234; 00760 ppMode->minAllowedY= 16; 00761 ppMode->baseDcDiff= 256/8; 00762 ppMode->flatnessThreshold= 56-16-1; 00763 ppMode->maxClippedThreshold= 0.01; 00764 ppMode->error=0; 00765 00766 memset(temp, 0, GET_MODE_BUFFER_SIZE); 00767 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1); 00768 00769 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name); 00770 00771 for(;;){ 00772 char *filterName; 00773 int q= 1000000; //PP_QUALITY_MAX; 00774 int chrom=-1; 00775 int luma=-1; 00776 char *option; 00777 char *options[OPTIONS_ARRAY_SIZE]; 00778 int i; 00779 int filterNameOk=0; 00780 int numOfUnknownOptions=0; 00781 int enable=1; //does the user want us to enabled or disabled the filter 00782 00783 filterToken= strtok(p, filterDelimiters); 00784 if(filterToken == NULL) break; 00785 p+= strlen(filterToken) + 1; // p points to next filterToken 00786 filterName= strtok(filterToken, optionDelimiters); 00787 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName); 00788 00789 if(*filterName == '-'){ 00790 enable=0; 00791 filterName++; 00792 } 00793 00794 for(;;){ //for all options 00795 option= strtok(NULL, optionDelimiters); 00796 if(option == NULL) break; 00797 00798 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option); 00799 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; 00800 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; 00801 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; 00802 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0; 00803 else{ 00804 options[numOfUnknownOptions] = option; 00805 numOfUnknownOptions++; 00806 } 00807 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; 00808 } 00809 options[numOfUnknownOptions] = NULL; 00810 00811 /* replace stuff from the replace Table */ 00812 for(i=0; replaceTable[2*i]!=NULL; i++){ 00813 if(!strcmp(replaceTable[2*i], filterName)){ 00814 int newlen= strlen(replaceTable[2*i + 1]); 00815 int plen; 00816 int spaceLeft; 00817 00818 if(p==NULL) p= temp, *p=0; //last filter 00819 else p--, *p=','; //not last filter 00820 00821 plen= strlen(p); 00822 spaceLeft= p - temp + plen; 00823 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){ 00824 ppMode->error++; 00825 break; 00826 } 00827 memmove(p + newlen, p, plen+1); 00828 memcpy(p, replaceTable[2*i + 1], newlen); 00829 filterNameOk=1; 00830 } 00831 } 00832 00833 for(i=0; filters[i].shortName!=NULL; i++){ 00834 if( !strcmp(filters[i].longName, filterName) 00835 || !strcmp(filters[i].shortName, filterName)){ 00836 ppMode->lumMode &= ~filters[i].mask; 00837 ppMode->chromMode &= ~filters[i].mask; 00838 00839 filterNameOk=1; 00840 if(!enable) break; // user wants to disable it 00841 00842 if(q >= filters[i].minLumQuality && luma) 00843 ppMode->lumMode|= filters[i].mask; 00844 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) 00845 if(q >= filters[i].minChromQuality) 00846 ppMode->chromMode|= filters[i].mask; 00847 00848 if(filters[i].mask == LEVEL_FIX){ 00849 int o; 00850 ppMode->minAllowedY= 16; 00851 ppMode->maxAllowedY= 234; 00852 for(o=0; options[o]!=NULL; o++){ 00853 if( !strcmp(options[o],"fullyrange") 00854 ||!strcmp(options[o],"f")){ 00855 ppMode->minAllowedY= 0; 00856 ppMode->maxAllowedY= 255; 00857 numOfUnknownOptions--; 00858 } 00859 } 00860 } 00861 else if(filters[i].mask == TEMP_NOISE_FILTER) 00862 { 00863 int o; 00864 int numOfNoises=0; 00865 00866 for(o=0; options[o]!=NULL; o++){ 00867 char *tail; 00868 ppMode->maxTmpNoise[numOfNoises]= 00869 strtol(options[o], &tail, 0); 00870 if(tail!=options[o]){ 00871 numOfNoises++; 00872 numOfUnknownOptions--; 00873 if(numOfNoises >= 3) break; 00874 } 00875 } 00876 } 00877 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK 00878 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){ 00879 int o; 00880 00881 for(o=0; options[o]!=NULL && o<2; o++){ 00882 char *tail; 00883 int val= strtol(options[o], &tail, 0); 00884 if(tail==options[o]) break; 00885 00886 numOfUnknownOptions--; 00887 if(o==0) ppMode->baseDcDiff= val; 00888 else ppMode->flatnessThreshold= val; 00889 } 00890 } 00891 else if(filters[i].mask == FORCE_QUANT){ 00892 int o; 00893 ppMode->forcedQuant= 15; 00894 00895 for(o=0; options[o]!=NULL && o<1; o++){ 00896 char *tail; 00897 int val= strtol(options[o], &tail, 0); 00898 if(tail==options[o]) break; 00899 00900 numOfUnknownOptions--; 00901 ppMode->forcedQuant= val; 00902 } 00903 } 00904 } 00905 } 00906 if(!filterNameOk) ppMode->error++; 00907 ppMode->error += numOfUnknownOptions; 00908 } 00909 00910 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); 00911 if(ppMode->error){ 00912 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); 00913 av_free(ppMode); 00914 return NULL; 00915 } 00916 return ppMode; 00917 } 00918 00919 void pp_free_mode(pp_mode *mode){ 00920 av_free(mode); 00921 } 00922 00923 static void reallocAlign(void **p, int alignment, int size){ 00924 av_free(*p); 00925 *p= av_mallocz(size); 00926 } 00927 00928 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){ 00929 int mbWidth = (width+15)>>4; 00930 int mbHeight= (height+15)>>4; 00931 int i; 00932 00933 c->stride= stride; 00934 c->qpStride= qpStride; 00935 00936 reallocAlign((void **)&c->tempDst, 8, stride*24); 00937 reallocAlign((void **)&c->tempSrc, 8, stride*24); 00938 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); 00939 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); 00940 for(i=0; i<256; i++) 00941 c->yHistogram[i]= width*height/64*15/256; 00942 00943 for(i=0; i<3; i++){ 00944 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end. 00945 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024); 00946 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size 00947 } 00948 00949 reallocAlign((void **)&c->deintTemp, 8, 2*width+32); 00950 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00951 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00952 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); 00953 } 00954 00955 static const char * context_to_name(void * ptr) { 00956 return "postproc"; 00957 } 00958 00959 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL }; 00960 00961 pp_context *pp_get_context(int width, int height, int cpuCaps){ 00962 PPContext *c= av_malloc(sizeof(PPContext)); 00963 int stride= FFALIGN(width, 16); //assumed / will realloc if needed 00964 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed 00965 00966 memset(c, 0, sizeof(PPContext)); 00967 c->av_class = &av_codec_context_class; 00968 c->cpuCaps= cpuCaps; 00969 if(cpuCaps&PP_FORMAT){ 00970 c->hChromaSubSample= cpuCaps&0x3; 00971 c->vChromaSubSample= (cpuCaps>>4)&0x3; 00972 }else{ 00973 c->hChromaSubSample= 1; 00974 c->vChromaSubSample= 1; 00975 } 00976 00977 reallocBuffers(c, width, height, stride, qpStride); 00978 00979 c->frameNum=-1; 00980 00981 return c; 00982 } 00983 00984 void pp_free_context(void *vc){ 00985 PPContext *c = (PPContext*)vc; 00986 int i; 00987 00988 for(i=0; i<3; i++) av_free(c->tempBlurred[i]); 00989 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]); 00990 00991 av_free(c->tempBlocks); 00992 av_free(c->yHistogram); 00993 av_free(c->tempDst); 00994 av_free(c->tempSrc); 00995 av_free(c->deintTemp); 00996 av_free(c->stdQPTable); 00997 av_free(c->nonBQPTable); 00998 av_free(c->forcedQPTable); 00999 01000 memset(c, 0, sizeof(PPContext)); 01001 01002 av_free(c); 01003 } 01004 01005 void pp_postprocess(const uint8_t * src[3], const int srcStride[3], 01006 uint8_t * dst[3], const int dstStride[3], 01007 int width, int height, 01008 const QP_STORE_T *QP_store, int QPStride, 01009 pp_mode *vm, void *vc, int pict_type) 01010 { 01011 int mbWidth = (width+15)>>4; 01012 int mbHeight= (height+15)>>4; 01013 PPMode *mode = (PPMode*)vm; 01014 PPContext *c = (PPContext*)vc; 01015 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0])); 01016 int absQPStride = FFABS(QPStride); 01017 01018 // c->stride and c->QPStride are always positive 01019 if(c->stride < minStride || c->qpStride < absQPStride) 01020 reallocBuffers(c, width, height, 01021 FFMAX(minStride, c->stride), 01022 FFMAX(c->qpStride, absQPStride)); 01023 01024 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){ 01025 int i; 01026 QP_store= c->forcedQPTable; 01027 absQPStride = QPStride = 0; 01028 if(mode->lumMode & FORCE_QUANT) 01029 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant; 01030 else 01031 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1; 01032 } 01033 01034 if(pict_type & PP_PICT_TYPE_QP2){ 01035 int i; 01036 const int count= mbHeight * absQPStride; 01037 for(i=0; i<(count>>2); i++){ 01038 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F; 01039 } 01040 for(i<<=2; i<count; i++){ 01041 c->stdQPTable[i] = QP_store[i]>>1; 01042 } 01043 QP_store= c->stdQPTable; 01044 QPStride= absQPStride; 01045 } 01046 01047 if(0){ 01048 int x,y; 01049 for(y=0; y<mbHeight; y++){ 01050 for(x=0; x<mbWidth; x++){ 01051 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]); 01052 } 01053 av_log(c, AV_LOG_INFO, "\n"); 01054 } 01055 av_log(c, AV_LOG_INFO, "\n"); 01056 } 01057 01058 if((pict_type&7)!=3){ 01059 if (QPStride >= 0){ 01060 int i; 01061 const int count= mbHeight * QPStride; 01062 for(i=0; i<(count>>2); i++){ 01063 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F; 01064 } 01065 for(i<<=2; i<count; i++){ 01066 c->nonBQPTable[i] = QP_store[i] & 0x3F; 01067 } 01068 } else { 01069 int i,j; 01070 for(i=0; i<mbHeight; i++) { 01071 for(j=0; j<absQPStride; j++) { 01072 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F; 01073 } 01074 } 01075 } 01076 } 01077 01078 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n", 01079 mode->lumMode, mode->chromMode); 01080 01081 postProcess(src[0], srcStride[0], dst[0], dstStride[0], 01082 width, height, QP_store, QPStride, 0, mode, c); 01083 01084 width = (width )>>c->hChromaSubSample; 01085 height = (height)>>c->vChromaSubSample; 01086 01087 if(mode->chromMode){ 01088 postProcess(src[1], srcStride[1], dst[1], dstStride[1], 01089 width, height, QP_store, QPStride, 1, mode, c); 01090 postProcess(src[2], srcStride[2], dst[2], dstStride[2], 01091 width, height, QP_store, QPStride, 2, mode, c); 01092 } 01093 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){ 01094 linecpy(dst[1], src[1], height, srcStride[1]); 01095 linecpy(dst[2], src[2], height, srcStride[2]); 01096 }else{ 01097 int y; 01098 for(y=0; y<height; y++){ 01099 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); 01100 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); 01101 } 01102 } 01103 }