Libav
|
00001 /* 00002 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) 00003 * 00004 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org> 00005 * 00006 * This file is part of FFmpeg. 00007 * 00008 * FFmpeg is free software; you can redistribute it and/or modify 00009 * it under the terms of the GNU General Public License as published by 00010 * the Free Software Foundation; either version 2 of the License, or 00011 * (at your option) any later version. 00012 * 00013 * FFmpeg is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 * GNU General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU General Public License 00019 * along with FFmpeg; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00028 /* 00029 C MMX MMX2 3DNow AltiVec 00030 isVertDC Ec Ec Ec 00031 isVertMinMaxOk Ec Ec Ec 00032 doVertLowPass E e e Ec 00033 doVertDefFilter Ec Ec e e Ec 00034 isHorizDC Ec Ec Ec 00035 isHorizMinMaxOk a E Ec 00036 doHorizLowPass E e e Ec 00037 doHorizDefFilter Ec Ec e e Ec 00038 do_a_deblock Ec E Ec E 00039 deRing E e e* Ecp 00040 Vertical RKAlgo1 E a a 00041 Horizontal RKAlgo1 a a 00042 Vertical X1# a E E 00043 Horizontal X1# a E E 00044 LinIpolDeinterlace e E E* 00045 CubicIpolDeinterlace a e e* 00046 LinBlendDeinterlace e E E* 00047 MedianDeinterlace# E Ec Ec 00048 TempDeNoiser# E e e Ec 00049 00050 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work 00051 # more or less selfinvented filters so the exactness is not too meaningful 00052 E = Exact implementation 00053 e = almost exact implementation (slightly different rounding,...) 00054 a = alternative / approximate impl 00055 c = checked against the other implementations (-vo md5) 00056 p = partially optimized, still some work to do 00057 */ 00058 00059 /* 00060 TODO: 00061 reduce the time wasted on the mem transfer 00062 unroll stuff if instructions depend too much on the prior one 00063 move YScale thing to the end instead of fixing QP 00064 write a faster and higher quality deblocking filter :) 00065 make the mainloop more flexible (variable number of blocks at once 00066 (the if/else stuff per block is slowing things down) 00067 compare the quality & speed of all filters 00068 split this huge file 00069 optimize c versions 00070 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 00071 ... 00072 */ 00073 00074 //Changelog: use the Subversion log 00075 00076 #include "config.h" 00077 #include "libavutil/avutil.h" 00078 #include <inttypes.h> 00079 #include <stdio.h> 00080 #include <stdlib.h> 00081 #include <string.h> 00082 //#undef HAVE_MMX2 00083 //#define HAVE_AMD3DNOW 00084 //#undef HAVE_MMX 00085 //#undef ARCH_X86 00086 //#define DEBUG_BRIGHTNESS 00087 #include "postprocess.h" 00088 #include "postprocess_internal.h" 00089 #include "libavutil/avstring.h" 00090 00091 unsigned postproc_version(void) 00092 { 00093 return LIBPOSTPROC_VERSION_INT; 00094 } 00095 00096 const char *postproc_configuration(void) 00097 { 00098 return FFMPEG_CONFIGURATION; 00099 } 00100 00101 const char *postproc_license(void) 00102 { 00103 #define LICENSE_PREFIX "libpostproc license: " 00104 return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1; 00105 } 00106 00107 #if HAVE_ALTIVEC_H 00108 #include <altivec.h> 00109 #endif 00110 00111 #define GET_MODE_BUFFER_SIZE 500 00112 #define OPTIONS_ARRAY_SIZE 10 00113 #define BLOCK_SIZE 8 00114 #define TEMP_STRIDE 8 00115 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet 00116 00117 #if ARCH_X86 00118 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL; 00119 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL; 00120 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL; 00121 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL; 00122 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL; 00123 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL; 00124 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL; 00125 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL; 00126 #endif 00127 00128 DECLARE_ASM_CONST(8, int, deringThreshold)= 20; 00129 00130 00131 static struct PPFilter filters[]= 00132 { 00133 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, 00134 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, 00135 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, 00136 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ 00137 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, 00138 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, 00139 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK}, 00140 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK}, 00141 {"dr", "dering", 1, 5, 6, DERING}, 00142 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, 00143 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, 00144 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, 00145 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, 00146 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, 00147 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, 00148 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER}, 00149 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, 00150 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, 00151 {NULL, NULL,0,0,0,0} //End Marker 00152 }; 00153 00154 static const char *replaceTable[]= 00155 { 00156 "default", "hb:a,vb:a,dr:a", 00157 "de", "hb:a,vb:a,dr:a", 00158 "fast", "h1:a,v1:a,dr:a", 00159 "fa", "h1:a,v1:a,dr:a", 00160 "ac", "ha:a:128:7,va:a,dr:a", 00161 NULL //End Marker 00162 }; 00163 00164 00165 #if ARCH_X86 00166 static inline void prefetchnta(void *p) 00167 { 00168 __asm__ volatile( "prefetchnta (%0)\n\t" 00169 : : "r" (p) 00170 ); 00171 } 00172 00173 static inline void prefetcht0(void *p) 00174 { 00175 __asm__ volatile( "prefetcht0 (%0)\n\t" 00176 : : "r" (p) 00177 ); 00178 } 00179 00180 static inline void prefetcht1(void *p) 00181 { 00182 __asm__ volatile( "prefetcht1 (%0)\n\t" 00183 : : "r" (p) 00184 ); 00185 } 00186 00187 static inline void prefetcht2(void *p) 00188 { 00189 __asm__ volatile( "prefetcht2 (%0)\n\t" 00190 : : "r" (p) 00191 ); 00192 } 00193 #endif 00194 00195 /* The horizontal functions exist only in C because the MMX 00196 * code is faster with vertical filters and transposing. */ 00197 00201 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c) 00202 { 00203 int numEq= 0; 00204 int y; 00205 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00206 const int dcThreshold= dcOffset*2 + 1; 00207 00208 for(y=0; y<BLOCK_SIZE; y++){ 00209 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; 00210 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; 00211 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; 00212 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; 00213 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; 00214 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; 00215 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; 00216 src+= stride; 00217 } 00218 return numEq > c->ppMode.flatnessThreshold; 00219 } 00220 00224 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c) 00225 { 00226 int numEq= 0; 00227 int y; 00228 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00229 const int dcThreshold= dcOffset*2 + 1; 00230 00231 src+= stride*4; // src points to begin of the 8x8 Block 00232 for(y=0; y<BLOCK_SIZE-1; y++){ 00233 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; 00234 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; 00235 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; 00236 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; 00237 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; 00238 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; 00239 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; 00240 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; 00241 src+= stride; 00242 } 00243 return numEq > c->ppMode.flatnessThreshold; 00244 } 00245 00246 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP) 00247 { 00248 int i; 00249 #if 1 00250 for(i=0; i<2; i++){ 00251 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; 00252 src += stride; 00253 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; 00254 src += stride; 00255 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; 00256 src += stride; 00257 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; 00258 src += stride; 00259 } 00260 #else 00261 for(i=0; i<8; i++){ 00262 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0; 00263 src += stride; 00264 } 00265 #endif 00266 return 1; 00267 } 00268 00269 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP) 00270 { 00271 #if 1 00272 #if 1 00273 int x; 00274 src+= stride*4; 00275 for(x=0; x<BLOCK_SIZE; x+=4){ 00276 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; 00277 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; 00278 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; 00279 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; 00280 } 00281 #else 00282 int x; 00283 src+= stride*3; 00284 for(x=0; x<BLOCK_SIZE; x++){ 00285 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; 00286 } 00287 #endif 00288 return 1; 00289 #else 00290 int x; 00291 src+= stride*4; 00292 for(x=0; x<BLOCK_SIZE; x++){ 00293 int min=255; 00294 int max=0; 00295 int y; 00296 for(y=0; y<8; y++){ 00297 int v= src[x + y*stride]; 00298 if(v>max) max=v; 00299 if(v<min) min=v; 00300 } 00301 if(max-min > 2*QP) return 0; 00302 } 00303 return 1; 00304 #endif 00305 } 00306 00307 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c) 00308 { 00309 if( isHorizDC_C(src, stride, c) ){ 00310 if( isHorizMinMaxOk_C(src, stride, c->QP) ) 00311 return 1; 00312 else 00313 return 0; 00314 }else{ 00315 return 2; 00316 } 00317 } 00318 00319 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c) 00320 { 00321 if( isVertDC_C(src, stride, c) ){ 00322 if( isVertMinMaxOk_C(src, stride, c->QP) ) 00323 return 1; 00324 else 00325 return 0; 00326 }else{ 00327 return 2; 00328 } 00329 } 00330 00331 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c) 00332 { 00333 int y; 00334 for(y=0; y<BLOCK_SIZE; y++){ 00335 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); 00336 00337 if(FFABS(middleEnergy) < 8*c->QP){ 00338 const int q=(dst[3] - dst[4])/2; 00339 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); 00340 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); 00341 00342 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00343 d= FFMAX(d, 0); 00344 00345 d= (5*d + 32) >> 6; 00346 d*= FFSIGN(-middleEnergy); 00347 00348 if(q>0) 00349 { 00350 d= d<0 ? 0 : d; 00351 d= d>q ? q : d; 00352 } 00353 else 00354 { 00355 d= d>0 ? 0 : d; 00356 d= d<q ? q : d; 00357 } 00358 00359 dst[3]-= d; 00360 dst[4]+= d; 00361 } 00362 dst+= stride; 00363 } 00364 } 00365 00370 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c) 00371 { 00372 int y; 00373 for(y=0; y<BLOCK_SIZE; y++){ 00374 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; 00375 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; 00376 00377 int sums[10]; 00378 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; 00379 sums[1] = sums[0] - first + dst[3]; 00380 sums[2] = sums[1] - first + dst[4]; 00381 sums[3] = sums[2] - first + dst[5]; 00382 sums[4] = sums[3] - first + dst[6]; 00383 sums[5] = sums[4] - dst[0] + dst[7]; 00384 sums[6] = sums[5] - dst[1] + last; 00385 sums[7] = sums[6] - dst[2] + last; 00386 sums[8] = sums[7] - dst[3] + last; 00387 sums[9] = sums[8] - dst[4] + last; 00388 00389 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; 00390 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; 00391 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; 00392 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; 00393 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; 00394 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; 00395 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; 00396 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; 00397 00398 dst+= stride; 00399 } 00400 } 00401 00410 static inline void horizX1Filter(uint8_t *src, int stride, int QP) 00411 { 00412 int y; 00413 static uint64_t *lut= NULL; 00414 if(lut==NULL) 00415 { 00416 int i; 00417 lut = av_malloc(256*8); 00418 for(i=0; i<256; i++) 00419 { 00420 int v= i < 128 ? 2*i : 2*(i-256); 00421 /* 00422 //Simulate 112242211 9-Tap filter 00423 uint64_t a= (v/16) & 0xFF; 00424 uint64_t b= (v/8) & 0xFF; 00425 uint64_t c= (v/4) & 0xFF; 00426 uint64_t d= (3*v/8) & 0xFF; 00427 */ 00428 //Simulate piecewise linear interpolation 00429 uint64_t a= (v/16) & 0xFF; 00430 uint64_t b= (v*3/16) & 0xFF; 00431 uint64_t c= (v*5/16) & 0xFF; 00432 uint64_t d= (7*v/16) & 0xFF; 00433 uint64_t A= (0x100 - a)&0xFF; 00434 uint64_t B= (0x100 - b)&0xFF; 00435 uint64_t C= (0x100 - c)&0xFF; 00436 uint64_t D= (0x100 - c)&0xFF; 00437 00438 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | 00439 (D<<24) | (C<<16) | (B<<8) | (A); 00440 //lut[i] = (v<<32) | (v<<24); 00441 } 00442 } 00443 00444 for(y=0; y<BLOCK_SIZE; y++){ 00445 int a= src[1] - src[2]; 00446 int b= src[3] - src[4]; 00447 int c= src[5] - src[6]; 00448 00449 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); 00450 00451 if(d < QP){ 00452 int v = d * FFSIGN(-b); 00453 00454 src[1] +=v/8; 00455 src[2] +=v/4; 00456 src[3] +=3*v/8; 00457 src[4] -=3*v/8; 00458 src[5] -=v/4; 00459 src[6] -=v/8; 00460 } 00461 src+=stride; 00462 } 00463 } 00464 00468 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){ 00469 int y; 00470 const int QP= c->QP; 00471 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 00472 const int dcThreshold= dcOffset*2 + 1; 00473 //START_TIMER 00474 src+= step*4; // src points to begin of the 8x8 Block 00475 for(y=0; y<8; y++){ 00476 int numEq= 0; 00477 00478 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++; 00479 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++; 00480 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++; 00481 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++; 00482 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++; 00483 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++; 00484 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++; 00485 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++; 00486 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++; 00487 if(numEq > c->ppMode.flatnessThreshold){ 00488 int min, max, x; 00489 00490 if(src[0] > src[step]){ 00491 max= src[0]; 00492 min= src[step]; 00493 }else{ 00494 max= src[step]; 00495 min= src[0]; 00496 } 00497 for(x=2; x<8; x+=2){ 00498 if(src[x*step] > src[(x+1)*step]){ 00499 if(src[x *step] > max) max= src[ x *step]; 00500 if(src[(x+1)*step] < min) min= src[(x+1)*step]; 00501 }else{ 00502 if(src[(x+1)*step] > max) max= src[(x+1)*step]; 00503 if(src[ x *step] < min) min= src[ x *step]; 00504 } 00505 } 00506 if(max-min < 2*QP){ 00507 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; 00508 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; 00509 00510 int sums[10]; 00511 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; 00512 sums[1] = sums[0] - first + src[3*step]; 00513 sums[2] = sums[1] - first + src[4*step]; 00514 sums[3] = sums[2] - first + src[5*step]; 00515 sums[4] = sums[3] - first + src[6*step]; 00516 sums[5] = sums[4] - src[0*step] + src[7*step]; 00517 sums[6] = sums[5] - src[1*step] + last; 00518 sums[7] = sums[6] - src[2*step] + last; 00519 sums[8] = sums[7] - src[3*step] + last; 00520 sums[9] = sums[8] - src[4*step] + last; 00521 00522 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; 00523 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; 00524 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; 00525 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; 00526 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; 00527 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; 00528 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; 00529 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; 00530 } 00531 }else{ 00532 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); 00533 00534 if(FFABS(middleEnergy) < 8*QP){ 00535 const int q=(src[3*step] - src[4*step])/2; 00536 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); 00537 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); 00538 00539 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00540 d= FFMAX(d, 0); 00541 00542 d= (5*d + 32) >> 6; 00543 d*= FFSIGN(-middleEnergy); 00544 00545 if(q>0){ 00546 d= d<0 ? 0 : d; 00547 d= d>q ? q : d; 00548 }else{ 00549 d= d>0 ? 0 : d; 00550 d= d<q ? q : d; 00551 } 00552 00553 src[3*step]-= d; 00554 src[4*step]+= d; 00555 } 00556 } 00557 00558 src += stride; 00559 } 00560 /*if(step==16){ 00561 STOP_TIMER("step16") 00562 }else{ 00563 STOP_TIMER("stepX") 00564 }*/ 00565 } 00566 00567 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one 00568 //Plain C versions 00569 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT 00570 #define COMPILE_C 00571 #endif 00572 00573 #if HAVE_ALTIVEC 00574 #define COMPILE_ALTIVEC 00575 #endif //HAVE_ALTIVEC 00576 00577 #if ARCH_X86 00578 00579 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00580 #define COMPILE_MMX 00581 #endif 00582 00583 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT 00584 #define COMPILE_MMX2 00585 #endif 00586 00587 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 00588 #define COMPILE_3DNOW 00589 #endif 00590 #endif /* ARCH_X86 */ 00591 00592 #undef HAVE_MMX 00593 #define HAVE_MMX 0 00594 #undef HAVE_MMX2 00595 #define HAVE_MMX2 0 00596 #undef HAVE_AMD3DNOW 00597 #define HAVE_AMD3DNOW 0 00598 #undef HAVE_ALTIVEC 00599 #define HAVE_ALTIVEC 0 00600 00601 #ifdef COMPILE_C 00602 #define RENAME(a) a ## _C 00603 #include "postprocess_template.c" 00604 #endif 00605 00606 #ifdef COMPILE_ALTIVEC 00607 #undef RENAME 00608 #undef HAVE_ALTIVEC 00609 #define HAVE_ALTIVEC 1 00610 #define RENAME(a) a ## _altivec 00611 #include "postprocess_altivec_template.c" 00612 #include "postprocess_template.c" 00613 #endif 00614 00615 //MMX versions 00616 #ifdef COMPILE_MMX 00617 #undef RENAME 00618 #undef HAVE_MMX 00619 #define HAVE_MMX 1 00620 #define RENAME(a) a ## _MMX 00621 #include "postprocess_template.c" 00622 #endif 00623 00624 //MMX2 versions 00625 #ifdef COMPILE_MMX2 00626 #undef RENAME 00627 #undef HAVE_MMX 00628 #undef HAVE_MMX2 00629 #define HAVE_MMX 1 00630 #define HAVE_MMX2 1 00631 #define RENAME(a) a ## _MMX2 00632 #include "postprocess_template.c" 00633 #endif 00634 00635 //3DNOW versions 00636 #ifdef COMPILE_3DNOW 00637 #undef RENAME 00638 #undef HAVE_MMX 00639 #undef HAVE_MMX2 00640 #undef HAVE_AMD3DNOW 00641 #define HAVE_MMX 1 00642 #define HAVE_MMX2 0 00643 #define HAVE_AMD3DNOW 1 00644 #define RENAME(a) a ## _3DNow 00645 #include "postprocess_template.c" 00646 #endif 00647 00648 // minor note: the HAVE_xyz is messed up after that line so do not use it. 00649 00650 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00651 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc) 00652 { 00653 PPContext *c= (PPContext *)vc; 00654 PPMode *ppMode= (PPMode *)vm; 00655 c->ppMode= *ppMode; //FIXME 00656 00657 // Using ifs here as they are faster than function pointers although the 00658 // difference would not be measurable here but it is much better because 00659 // someone might exchange the CPU whithout restarting MPlayer ;) 00660 #if CONFIG_RUNTIME_CPUDETECT 00661 #if ARCH_X86 00662 // ordered per speed fastest first 00663 if(c->cpuCaps & PP_CPU_CAPS_MMX2) 00664 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00665 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) 00666 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00667 else if(c->cpuCaps & PP_CPU_CAPS_MMX) 00668 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00669 else 00670 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00671 #else 00672 #if HAVE_ALTIVEC 00673 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC) 00674 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00675 else 00676 #endif 00677 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00678 #endif 00679 #else //CONFIG_RUNTIME_CPUDETECT 00680 #if HAVE_MMX2 00681 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00682 #elif HAVE_AMD3DNOW 00683 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00684 #elif HAVE_MMX 00685 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00686 #elif HAVE_ALTIVEC 00687 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00688 #else 00689 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 00690 #endif 00691 #endif //!CONFIG_RUNTIME_CPUDETECT 00692 } 00693 00694 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 00695 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 00696 00697 /* -pp Command line Help 00698 */ 00699 #if LIBPOSTPROC_VERSION_INT < (52<<16) 00700 const char *const pp_help= 00701 #else 00702 const char pp_help[] = 00703 #endif 00704 "Available postprocessing filters:\n" 00705 "Filters Options\n" 00706 "short long name short long option Description\n" 00707 "* * a autoq CPU power dependent enabler\n" 00708 " c chrom chrominance filtering enabled\n" 00709 " y nochrom chrominance filtering disabled\n" 00710 " n noluma luma filtering disabled\n" 00711 "hb hdeblock (2 threshold) horizontal deblocking filter\n" 00712 " 1. difference factor: default=32, higher -> more deblocking\n" 00713 " 2. flatness threshold: default=39, lower -> more deblocking\n" 00714 " the h & v deblocking filters share these\n" 00715 " so you can't set different thresholds for h / v\n" 00716 "vb vdeblock (2 threshold) vertical deblocking filter\n" 00717 "ha hadeblock (2 threshold) horizontal deblocking filter\n" 00718 "va vadeblock (2 threshold) vertical deblocking filter\n" 00719 "h1 x1hdeblock experimental h deblock filter 1\n" 00720 "v1 x1vdeblock experimental v deblock filter 1\n" 00721 "dr dering deringing filter\n" 00722 "al autolevels automatic brightness / contrast\n" 00723 " f fullyrange stretch luminance to (0..255)\n" 00724 "lb linblenddeint linear blend deinterlacer\n" 00725 "li linipoldeint linear interpolating deinterlace\n" 00726 "ci cubicipoldeint cubic interpolating deinterlacer\n" 00727 "md mediandeint median deinterlacer\n" 00728 "fd ffmpegdeint ffmpeg deinterlacer\n" 00729 "l5 lowpass5 FIR lowpass deinterlacer\n" 00730 "de default hb:a,vb:a,dr:a\n" 00731 "fa fast h1:a,v1:a,dr:a\n" 00732 "ac ha:a:128:7,va:a,dr:a\n" 00733 "tn tmpnoise (3 threshold) temporal noise reducer\n" 00734 " 1. <= 2. <= 3. larger -> stronger filtering\n" 00735 "fq forceQuant <quantizer> force quantizer\n" 00736 "Usage:\n" 00737 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" 00738 "long form example:\n" 00739 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" 00740 "short form example:\n" 00741 "vb:a/hb:a/lb de,-vb\n" 00742 "more examples:\n" 00743 "tn:64:128:256\n" 00744 "\n" 00745 ; 00746 00747 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality) 00748 { 00749 char temp[GET_MODE_BUFFER_SIZE]; 00750 char *p= temp; 00751 static const char filterDelimiters[] = ",/"; 00752 static const char optionDelimiters[] = ":"; 00753 struct PPMode *ppMode; 00754 char *filterToken; 00755 00756 ppMode= av_malloc(sizeof(PPMode)); 00757 00758 ppMode->lumMode= 0; 00759 ppMode->chromMode= 0; 00760 ppMode->maxTmpNoise[0]= 700; 00761 ppMode->maxTmpNoise[1]= 1500; 00762 ppMode->maxTmpNoise[2]= 3000; 00763 ppMode->maxAllowedY= 234; 00764 ppMode->minAllowedY= 16; 00765 ppMode->baseDcDiff= 256/8; 00766 ppMode->flatnessThreshold= 56-16-1; 00767 ppMode->maxClippedThreshold= 0.01; 00768 ppMode->error=0; 00769 00770 memset(temp, 0, GET_MODE_BUFFER_SIZE); 00771 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1); 00772 00773 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name); 00774 00775 for(;;){ 00776 char *filterName; 00777 int q= 1000000; //PP_QUALITY_MAX; 00778 int chrom=-1; 00779 int luma=-1; 00780 char *option; 00781 char *options[OPTIONS_ARRAY_SIZE]; 00782 int i; 00783 int filterNameOk=0; 00784 int numOfUnknownOptions=0; 00785 int enable=1; //does the user want us to enabled or disabled the filter 00786 00787 filterToken= strtok(p, filterDelimiters); 00788 if(filterToken == NULL) break; 00789 p+= strlen(filterToken) + 1; // p points to next filterToken 00790 filterName= strtok(filterToken, optionDelimiters); 00791 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName); 00792 00793 if(*filterName == '-'){ 00794 enable=0; 00795 filterName++; 00796 } 00797 00798 for(;;){ //for all options 00799 option= strtok(NULL, optionDelimiters); 00800 if(option == NULL) break; 00801 00802 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option); 00803 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; 00804 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; 00805 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; 00806 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0; 00807 else{ 00808 options[numOfUnknownOptions] = option; 00809 numOfUnknownOptions++; 00810 } 00811 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; 00812 } 00813 options[numOfUnknownOptions] = NULL; 00814 00815 /* replace stuff from the replace Table */ 00816 for(i=0; replaceTable[2*i]!=NULL; i++){ 00817 if(!strcmp(replaceTable[2*i], filterName)){ 00818 int newlen= strlen(replaceTable[2*i + 1]); 00819 int plen; 00820 int spaceLeft; 00821 00822 if(p==NULL) p= temp, *p=0; //last filter 00823 else p--, *p=','; //not last filter 00824 00825 plen= strlen(p); 00826 spaceLeft= p - temp + plen; 00827 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){ 00828 ppMode->error++; 00829 break; 00830 } 00831 memmove(p + newlen, p, plen+1); 00832 memcpy(p, replaceTable[2*i + 1], newlen); 00833 filterNameOk=1; 00834 } 00835 } 00836 00837 for(i=0; filters[i].shortName!=NULL; i++){ 00838 if( !strcmp(filters[i].longName, filterName) 00839 || !strcmp(filters[i].shortName, filterName)){ 00840 ppMode->lumMode &= ~filters[i].mask; 00841 ppMode->chromMode &= ~filters[i].mask; 00842 00843 filterNameOk=1; 00844 if(!enable) break; // user wants to disable it 00845 00846 if(q >= filters[i].minLumQuality && luma) 00847 ppMode->lumMode|= filters[i].mask; 00848 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) 00849 if(q >= filters[i].minChromQuality) 00850 ppMode->chromMode|= filters[i].mask; 00851 00852 if(filters[i].mask == LEVEL_FIX){ 00853 int o; 00854 ppMode->minAllowedY= 16; 00855 ppMode->maxAllowedY= 234; 00856 for(o=0; options[o]!=NULL; o++){ 00857 if( !strcmp(options[o],"fullyrange") 00858 ||!strcmp(options[o],"f")){ 00859 ppMode->minAllowedY= 0; 00860 ppMode->maxAllowedY= 255; 00861 numOfUnknownOptions--; 00862 } 00863 } 00864 } 00865 else if(filters[i].mask == TEMP_NOISE_FILTER) 00866 { 00867 int o; 00868 int numOfNoises=0; 00869 00870 for(o=0; options[o]!=NULL; o++){ 00871 char *tail; 00872 ppMode->maxTmpNoise[numOfNoises]= 00873 strtol(options[o], &tail, 0); 00874 if(tail!=options[o]){ 00875 numOfNoises++; 00876 numOfUnknownOptions--; 00877 if(numOfNoises >= 3) break; 00878 } 00879 } 00880 } 00881 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK 00882 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){ 00883 int o; 00884 00885 for(o=0; options[o]!=NULL && o<2; o++){ 00886 char *tail; 00887 int val= strtol(options[o], &tail, 0); 00888 if(tail==options[o]) break; 00889 00890 numOfUnknownOptions--; 00891 if(o==0) ppMode->baseDcDiff= val; 00892 else ppMode->flatnessThreshold= val; 00893 } 00894 } 00895 else if(filters[i].mask == FORCE_QUANT){ 00896 int o; 00897 ppMode->forcedQuant= 15; 00898 00899 for(o=0; options[o]!=NULL && o<1; o++){ 00900 char *tail; 00901 int val= strtol(options[o], &tail, 0); 00902 if(tail==options[o]) break; 00903 00904 numOfUnknownOptions--; 00905 ppMode->forcedQuant= val; 00906 } 00907 } 00908 } 00909 } 00910 if(!filterNameOk) ppMode->error++; 00911 ppMode->error += numOfUnknownOptions; 00912 } 00913 00914 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); 00915 if(ppMode->error){ 00916 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); 00917 av_free(ppMode); 00918 return NULL; 00919 } 00920 return ppMode; 00921 } 00922 00923 void pp_free_mode(pp_mode *mode){ 00924 av_free(mode); 00925 } 00926 00927 static void reallocAlign(void **p, int alignment, int size){ 00928 av_free(*p); 00929 *p= av_mallocz(size); 00930 } 00931 00932 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){ 00933 int mbWidth = (width+15)>>4; 00934 int mbHeight= (height+15)>>4; 00935 int i; 00936 00937 c->stride= stride; 00938 c->qpStride= qpStride; 00939 00940 reallocAlign((void **)&c->tempDst, 8, stride*24); 00941 reallocAlign((void **)&c->tempSrc, 8, stride*24); 00942 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); 00943 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); 00944 for(i=0; i<256; i++) 00945 c->yHistogram[i]= width*height/64*15/256; 00946 00947 for(i=0; i<3; i++){ 00948 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end. 00949 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024); 00950 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size 00951 } 00952 00953 reallocAlign((void **)&c->deintTemp, 8, 2*width+32); 00954 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00955 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 00956 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); 00957 } 00958 00959 static const char * context_to_name(void * ptr) { 00960 return "postproc"; 00961 } 00962 00963 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL }; 00964 00965 pp_context *pp_get_context(int width, int height, int cpuCaps){ 00966 PPContext *c= av_malloc(sizeof(PPContext)); 00967 int stride= FFALIGN(width, 16); //assumed / will realloc if needed 00968 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed 00969 00970 memset(c, 0, sizeof(PPContext)); 00971 c->av_class = &av_codec_context_class; 00972 c->cpuCaps= cpuCaps; 00973 if(cpuCaps&PP_FORMAT){ 00974 c->hChromaSubSample= cpuCaps&0x3; 00975 c->vChromaSubSample= (cpuCaps>>4)&0x3; 00976 }else{ 00977 c->hChromaSubSample= 1; 00978 c->vChromaSubSample= 1; 00979 } 00980 00981 reallocBuffers(c, width, height, stride, qpStride); 00982 00983 c->frameNum=-1; 00984 00985 return c; 00986 } 00987 00988 void pp_free_context(void *vc){ 00989 PPContext *c = (PPContext*)vc; 00990 int i; 00991 00992 for(i=0; i<3; i++) av_free(c->tempBlurred[i]); 00993 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]); 00994 00995 av_free(c->tempBlocks); 00996 av_free(c->yHistogram); 00997 av_free(c->tempDst); 00998 av_free(c->tempSrc); 00999 av_free(c->deintTemp); 01000 av_free(c->stdQPTable); 01001 av_free(c->nonBQPTable); 01002 av_free(c->forcedQPTable); 01003 01004 memset(c, 0, sizeof(PPContext)); 01005 01006 av_free(c); 01007 } 01008 01009 void pp_postprocess(const uint8_t * src[3], const int srcStride[3], 01010 uint8_t * dst[3], const int dstStride[3], 01011 int width, int height, 01012 const QP_STORE_T *QP_store, int QPStride, 01013 pp_mode *vm, void *vc, int pict_type) 01014 { 01015 int mbWidth = (width+15)>>4; 01016 int mbHeight= (height+15)>>4; 01017 PPMode *mode = (PPMode*)vm; 01018 PPContext *c = (PPContext*)vc; 01019 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0])); 01020 int absQPStride = FFABS(QPStride); 01021 01022 // c->stride and c->QPStride are always positive 01023 if(c->stride < minStride || c->qpStride < absQPStride) 01024 reallocBuffers(c, width, height, 01025 FFMAX(minStride, c->stride), 01026 FFMAX(c->qpStride, absQPStride)); 01027 01028 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){ 01029 int i; 01030 QP_store= c->forcedQPTable; 01031 absQPStride = QPStride = 0; 01032 if(mode->lumMode & FORCE_QUANT) 01033 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant; 01034 else 01035 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1; 01036 } 01037 01038 if(pict_type & PP_PICT_TYPE_QP2){ 01039 int i; 01040 const int count= mbHeight * absQPStride; 01041 for(i=0; i<(count>>2); i++){ 01042 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F; 01043 } 01044 for(i<<=2; i<count; i++){ 01045 c->stdQPTable[i] = QP_store[i]>>1; 01046 } 01047 QP_store= c->stdQPTable; 01048 QPStride= absQPStride; 01049 } 01050 01051 if(0){ 01052 int x,y; 01053 for(y=0; y<mbHeight; y++){ 01054 for(x=0; x<mbWidth; x++){ 01055 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]); 01056 } 01057 av_log(c, AV_LOG_INFO, "\n"); 01058 } 01059 av_log(c, AV_LOG_INFO, "\n"); 01060 } 01061 01062 if((pict_type&7)!=3){ 01063 if (QPStride >= 0){ 01064 int i; 01065 const int count= mbHeight * QPStride; 01066 for(i=0; i<(count>>2); i++){ 01067 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F; 01068 } 01069 for(i<<=2; i<count; i++){ 01070 c->nonBQPTable[i] = QP_store[i] & 0x3F; 01071 } 01072 } else { 01073 int i,j; 01074 for(i=0; i<mbHeight; i++) { 01075 for(j=0; j<absQPStride; j++) { 01076 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F; 01077 } 01078 } 01079 } 01080 } 01081 01082 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n", 01083 mode->lumMode, mode->chromMode); 01084 01085 postProcess(src[0], srcStride[0], dst[0], dstStride[0], 01086 width, height, QP_store, QPStride, 0, mode, c); 01087 01088 width = (width )>>c->hChromaSubSample; 01089 height = (height)>>c->vChromaSubSample; 01090 01091 if(mode->chromMode){ 01092 postProcess(src[1], srcStride[1], dst[1], dstStride[1], 01093 width, height, QP_store, QPStride, 1, mode, c); 01094 postProcess(src[2], srcStride[2], dst[2], dstStride[2], 01095 width, height, QP_store, QPStride, 2, mode, c); 01096 } 01097 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){ 01098 linecpy(dst[1], src[1], height, srcStride[1]); 01099 linecpy(dst[2], src[2], height, srcStride[2]); 01100 }else{ 01101 int y; 01102 for(y=0; y<height; y++){ 01103 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); 01104 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); 01105 } 01106 } 01107 } 01108