libavcodec/h264pred_template.c
Go to the documentation of this file.
00001 /*
00002  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
00003  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
00004  *
00005  * This file is part of Libav.
00006  *
00007  * Libav is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * Libav is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with Libav; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00028 #include "mathops.h"
00029 
00030 #include "bit_depth_template.c"
00031 
00032 static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
00033     pixel *src = (pixel*)_src;
00034     int stride = _stride/sizeof(pixel);
00035     const pixel4 a= AV_RN4PA(src-stride);
00036 
00037     AV_WN4PA(src+0*stride, a);
00038     AV_WN4PA(src+1*stride, a);
00039     AV_WN4PA(src+2*stride, a);
00040     AV_WN4PA(src+3*stride, a);
00041 }
00042 
00043 static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
00044     pixel *src = (pixel*)_src;
00045     int stride = _stride/sizeof(pixel);
00046     AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
00047     AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
00048     AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
00049     AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
00050 }
00051 
00052 static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00053     pixel *src = (pixel*)_src;
00054     int stride = _stride/sizeof(pixel);
00055     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
00056                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
00057     const pixel4 a = PIXEL_SPLAT_X4(dc);
00058 
00059     AV_WN4PA(src+0*stride, a);
00060     AV_WN4PA(src+1*stride, a);
00061     AV_WN4PA(src+2*stride, a);
00062     AV_WN4PA(src+3*stride, a);
00063 }
00064 
00065 static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00066     pixel *src = (pixel*)_src;
00067     int stride = _stride/sizeof(pixel);
00068     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
00069     const pixel4 a = PIXEL_SPLAT_X4(dc);
00070 
00071     AV_WN4PA(src+0*stride, a);
00072     AV_WN4PA(src+1*stride, a);
00073     AV_WN4PA(src+2*stride, a);
00074     AV_WN4PA(src+3*stride, a);
00075 }
00076 
00077 static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00078     pixel *src = (pixel*)_src;
00079     int stride = _stride/sizeof(pixel);
00080     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
00081     const pixel4 a = PIXEL_SPLAT_X4(dc);
00082 
00083     AV_WN4PA(src+0*stride, a);
00084     AV_WN4PA(src+1*stride, a);
00085     AV_WN4PA(src+2*stride, a);
00086     AV_WN4PA(src+3*stride, a);
00087 }
00088 
00089 static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00090     pixel *src = (pixel*)_src;
00091     int stride = _stride/sizeof(pixel);
00092     const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
00093 
00094     AV_WN4PA(src+0*stride, a);
00095     AV_WN4PA(src+1*stride, a);
00096     AV_WN4PA(src+2*stride, a);
00097     AV_WN4PA(src+3*stride, a);
00098 }
00099 
00100 static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00101     pixel *src = (pixel*)_src;
00102     int stride = _stride/sizeof(pixel);
00103     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
00104 
00105     AV_WN4PA(src+0*stride, a);
00106     AV_WN4PA(src+1*stride, a);
00107     AV_WN4PA(src+2*stride, a);
00108     AV_WN4PA(src+3*stride, a);
00109 }
00110 
00111 static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
00112     pixel *src = (pixel*)_src;
00113     int stride = _stride/sizeof(pixel);
00114     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
00115 
00116     AV_WN4PA(src+0*stride, a);
00117     AV_WN4PA(src+1*stride, a);
00118     AV_WN4PA(src+2*stride, a);
00119     AV_WN4PA(src+3*stride, a);
00120 }
00121 
00122 
/*
 * Neighbour loaders used by the 4x4 directional predictors.
 * Each macro declares four const unsigned locals (tagged av_unused so that
 * a predictor may leave some of them untouched). They expect `src` (pixel
 * pointer to the block) and `stride` (in pixels) to be in scope; the
 * top-right loader additionally expects a pixel pointer named `topright`.
 */

/* t4..t7: the four samples starting at topright */
#define LOAD_TOP_RIGHT_EDGE                          \
    const unsigned av_unused t4 = topright[0];       \
    const unsigned av_unused t5 = topright[1];       \
    const unsigned av_unused t6 = topright[2];       \
    const unsigned av_unused t7 = topright[3];

/* l4..l7: left-column samples of rows 4..7 (below the block) */
#define LOAD_DOWN_LEFT_EDGE                          \
    const unsigned av_unused l4 = src[4*stride-1];   \
    const unsigned av_unused l5 = src[5*stride-1];   \
    const unsigned av_unused l6 = src[6*stride-1];   \
    const unsigned av_unused l7 = src[7*stride-1];

/* l0..l3: left-column samples of rows 0..3 */
#define LOAD_LEFT_EDGE                               \
    const unsigned av_unused l0 = src[0*stride-1];   \
    const unsigned av_unused l1 = src[1*stride-1];   \
    const unsigned av_unused l2 = src[2*stride-1];   \
    const unsigned av_unused l3 = src[3*stride-1];

/* t0..t3: the four samples in the row above the block */
#define LOAD_TOP_EDGE                                \
    const unsigned av_unused t0 = src[0-stride];     \
    const unsigned av_unused t1 = src[1-stride];     \
    const unsigned av_unused t2 = src[2-stride];     \
    const unsigned av_unused t3 = src[3-stride];
00146 
00147 static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
00148     pixel *src = (pixel*)_src;
00149     int stride = _stride/sizeof(pixel);
00150     const int lt= src[-1-1*stride];
00151     LOAD_TOP_EDGE
00152     LOAD_LEFT_EDGE
00153 
00154     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
00155     src[0+2*stride]=
00156     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
00157     src[0+1*stride]=
00158     src[1+2*stride]=
00159     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
00160     src[0+0*stride]=
00161     src[1+1*stride]=
00162     src[2+2*stride]=
00163     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
00164     src[1+0*stride]=
00165     src[2+1*stride]=
00166     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
00167     src[2+0*stride]=
00168     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
00169     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
00170 }
00171 
00172 static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
00173     pixel *src = (pixel*)_src;
00174     const pixel *topright = (const pixel*)_topright;
00175     int stride = _stride/sizeof(pixel);
00176     LOAD_TOP_EDGE
00177     LOAD_TOP_RIGHT_EDGE
00178 //    LOAD_LEFT_EDGE
00179 
00180     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
00181     src[1+0*stride]=
00182     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
00183     src[2+0*stride]=
00184     src[1+1*stride]=
00185     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
00186     src[3+0*stride]=
00187     src[2+1*stride]=
00188     src[1+2*stride]=
00189     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
00190     src[3+1*stride]=
00191     src[2+2*stride]=
00192     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
00193     src[3+2*stride]=
00194     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
00195     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
00196 }
00197 
00198 static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){
00199     pixel *src = (pixel*)_src;
00200     int stride = _stride/sizeof(pixel);
00201     const int lt= src[-1-1*stride];
00202     LOAD_TOP_EDGE
00203     LOAD_LEFT_EDGE
00204 
00205     src[0+0*stride]=
00206     src[1+2*stride]=(lt + t0 + 1)>>1;
00207     src[1+0*stride]=
00208     src[2+2*stride]=(t0 + t1 + 1)>>1;
00209     src[2+0*stride]=
00210     src[3+2*stride]=(t1 + t2 + 1)>>1;
00211     src[3+0*stride]=(t2 + t3 + 1)>>1;
00212     src[0+1*stride]=
00213     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
00214     src[1+1*stride]=
00215     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
00216     src[2+1*stride]=
00217     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
00218     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
00219     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
00220     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
00221 }
00222 
00223 static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
00224     pixel *src = (pixel*)_src;
00225     const pixel *topright = (const pixel*)_topright;
00226     int stride = _stride/sizeof(pixel);
00227     LOAD_TOP_EDGE
00228     LOAD_TOP_RIGHT_EDGE
00229 
00230     src[0+0*stride]=(t0 + t1 + 1)>>1;
00231     src[1+0*stride]=
00232     src[0+2*stride]=(t1 + t2 + 1)>>1;
00233     src[2+0*stride]=
00234     src[1+2*stride]=(t2 + t3 + 1)>>1;
00235     src[3+0*stride]=
00236     src[2+2*stride]=(t3 + t4+ 1)>>1;
00237     src[3+2*stride]=(t4 + t5+ 1)>>1;
00238     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
00239     src[1+1*stride]=
00240     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
00241     src[2+1*stride]=
00242     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
00243     src[3+1*stride]=
00244     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
00245     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
00246 }
00247 
00248 static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){
00249     pixel *src = (pixel*)_src;
00250     int stride = _stride/sizeof(pixel);
00251     LOAD_LEFT_EDGE
00252 
00253     src[0+0*stride]=(l0 + l1 + 1)>>1;
00254     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
00255     src[2+0*stride]=
00256     src[0+1*stride]=(l1 + l2 + 1)>>1;
00257     src[3+0*stride]=
00258     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
00259     src[2+1*stride]=
00260     src[0+2*stride]=(l2 + l3 + 1)>>1;
00261     src[3+1*stride]=
00262     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
00263     src[3+2*stride]=
00264     src[1+3*stride]=
00265     src[0+3*stride]=
00266     src[2+2*stride]=
00267     src[2+3*stride]=
00268     src[3+3*stride]=l3;
00269 }
00270 
00271 static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){
00272     pixel *src = (pixel*)_src;
00273     int stride = _stride/sizeof(pixel);
00274     const int lt= src[-1-1*stride];
00275     LOAD_TOP_EDGE
00276     LOAD_LEFT_EDGE
00277 
00278     src[0+0*stride]=
00279     src[2+1*stride]=(lt + l0 + 1)>>1;
00280     src[1+0*stride]=
00281     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
00282     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
00283     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
00284     src[0+1*stride]=
00285     src[2+2*stride]=(l0 + l1 + 1)>>1;
00286     src[1+1*stride]=
00287     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
00288     src[0+2*stride]=
00289     src[2+3*stride]=(l1 + l2+ 1)>>1;
00290     src[1+2*stride]=
00291     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
00292     src[0+3*stride]=(l2 + l3 + 1)>>1;
00293     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
00294 }
00295 
00296 static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
00297     int i;
00298     pixel *src = (pixel*)_src;
00299     int stride = _stride/sizeof(pixel);
00300     const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
00301     const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
00302     const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
00303     const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
00304 
00305     for(i=0; i<16; i++){
00306         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00307         AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
00308         AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
00309         AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
00310     }
00311 }
00312 
00313 static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
00314     int i;
00315     pixel *src = (pixel*)_src;
00316     stride /= sizeof(pixel);
00317 
00318     for(i=0; i<16; i++){
00319         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
00320 
00321         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00322         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
00323         AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
00324         AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
00325     }
00326 }
00327 
/*
 * Fill 16 rows of 16 pixels with the 4-pixel pattern v.
 * Uses `i`, `src` and `stride` from the invoking scope; `src` is advanced
 * by one stride per row, so it no longer points at the block afterwards.
 */
#define PREDICT_16x16_DC(v)       \
    for (i = 0; i < 16; i++) {    \
        AV_WN4PA(src +  0, v);    \
        AV_WN4PA(src +  4, v);    \
        AV_WN4PA(src +  8, v);    \
        AV_WN4PA(src + 12, v);    \
        src += stride;            \
    }
00336 
00337 static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){
00338     int i, dc=0;
00339     pixel *src = (pixel*)_src;
00340     pixel4 dcsplat;
00341     stride /= sizeof(pixel);
00342 
00343     for(i=0;i<16; i++){
00344         dc+= src[-1+i*stride];
00345     }
00346 
00347     for(i=0;i<16; i++){
00348         dc+= src[i-stride];
00349     }
00350 
00351     dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
00352     PREDICT_16x16_DC(dcsplat);
00353 }
00354 
00355 static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){
00356     int i, dc=0;
00357     pixel *src = (pixel*)_src;
00358     pixel4 dcsplat;
00359     stride /= sizeof(pixel);
00360 
00361     for(i=0;i<16; i++){
00362         dc+= src[-1+i*stride];
00363     }
00364 
00365     dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
00366     PREDICT_16x16_DC(dcsplat);
00367 }
00368 
00369 static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){
00370     int i, dc=0;
00371     pixel *src = (pixel*)_src;
00372     pixel4 dcsplat;
00373     stride /= sizeof(pixel);
00374 
00375     for(i=0;i<16; i++){
00376         dc+= src[i-stride];
00377     }
00378 
00379     dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
00380     PREDICT_16x16_DC(dcsplat);
00381 }
00382 
00383 #define PRED16x16_X(n, v) \
00384 static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\
00385     int i;\
00386     pixel *src = (pixel*)_src;\
00387     stride /= sizeof(pixel);\
00388     PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
00389 }
00390 
00391 PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
00392 PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
00393 PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
00394 
00395 static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){
00396   int i, j, k;
00397   int a;
00398   INIT_CLIP
00399   pixel *src = (pixel*)_src;
00400   int stride = _stride/sizeof(pixel);
00401   const pixel * const src0 = src +7-stride;
00402   const pixel *       src1 = src +8*stride-1;
00403   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
00404   int H = src0[1] - src0[-1];
00405   int V = src1[0] - src2[ 0];
00406   for(k=2; k<=8; ++k) {
00407     src1 += stride; src2 -= stride;
00408     H += k*(src0[k] - src0[-k]);
00409     V += k*(src1[0] - src2[ 0]);
00410   }
00411   if(svq3){
00412     H = ( 5*(H/4) ) / 16;
00413     V = ( 5*(V/4) ) / 16;
00414 
00415     /* required for 100% accuracy */
00416     i = H; H = V; V = i;
00417   }else if(rv40){
00418     H = ( H + (H>>2) ) >> 4;
00419     V = ( V + (V>>2) ) >> 4;
00420   }else{
00421     H = ( 5*H+32 ) >> 6;
00422     V = ( 5*V+32 ) >> 6;
00423   }
00424 
00425   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
00426   for(j=16; j>0; --j) {
00427     int b = a;
00428     a += V;
00429     for(i=-16; i<0; i+=4) {
00430       src[16+i] = CLIP((b    ) >> 5);
00431       src[17+i] = CLIP((b+  H) >> 5);
00432       src[18+i] = CLIP((b+2*H) >> 5);
00433       src[19+i] = CLIP((b+3*H) >> 5);
00434       b += 4*H;
00435     }
00436     src += stride;
00437   }
00438 }
00439 
00440 static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
00441     FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
00442 }
00443 
00444 static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
00445     int i;
00446     pixel *src = (pixel*)_src;
00447     int stride = _stride/sizeof(pixel);
00448     const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
00449     const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
00450 
00451     for(i=0; i<8; i++){
00452         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00453         AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
00454     }
00455 }
00456 
00457 static void FUNCC(pred8x16_vertical)(uint8_t *_src, int _stride){
00458     int i;
00459     pixel *src = (pixel*)_src;
00460     int stride = _stride>>(sizeof(pixel)-1);
00461     const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
00462     const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
00463 
00464     for(i=0; i<16; i++){
00465         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00466         AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
00467     }
00468 }
00469 
00470 static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
00471     int i;
00472     pixel *src = (pixel*)_src;
00473     stride /= sizeof(pixel);
00474 
00475     for(i=0; i<8; i++){
00476         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
00477         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00478         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
00479     }
00480 }
00481 
00482 static void FUNCC(pred8x16_horizontal)(uint8_t *_src, int stride){
00483     int i;
00484     pixel *src = (pixel*)_src;
00485     stride >>= sizeof(pixel)-1;
00486     for(i=0; i<16; i++){
00487         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
00488         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
00489         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
00490     }
00491 }
00492 
00493 #define PRED8x8_X(n, v)\
00494 static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
00495     int i;\
00496     const pixel4 a = PIXEL_SPLAT_X4(v);\
00497     pixel *src = (pixel*)_src;\
00498     stride /= sizeof(pixel);\
00499     for(i=0; i<8; i++){\
00500         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
00501         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
00502     }\
00503 }
00504 
00505 PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
00506 PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
00507 PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
00508 
00509 static void FUNCC(pred8x16_128_dc)(uint8_t *_src, int stride){
00510     FUNCC(pred8x8_128_dc)(_src, stride);
00511     FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
00512 }
00513 
00514 static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
00515     int i;
00516     int dc0, dc2;
00517     pixel4 dc0splat, dc2splat;
00518     pixel *src = (pixel*)_src;
00519     stride /= sizeof(pixel);
00520 
00521     dc0=dc2=0;
00522     for(i=0;i<4; i++){
00523         dc0+= src[-1+i*stride];
00524         dc2+= src[-1+(i+4)*stride];
00525     }
00526     dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
00527     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
00528 
00529     for(i=0; i<4; i++){
00530         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00531         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
00532     }
00533     for(i=4; i<8; i++){
00534         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
00535         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
00536     }
00537 }
00538 
00539 static void FUNCC(pred8x16_left_dc)(uint8_t *_src, int stride){
00540     FUNCC(pred8x8_left_dc)(_src, stride);
00541     FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
00542 }
00543 
00544 static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
00545     int i;
00546     int dc0, dc1;
00547     pixel4 dc0splat, dc1splat;
00548     pixel *src = (pixel*)_src;
00549     stride /= sizeof(pixel);
00550 
00551     dc0=dc1=0;
00552     for(i=0;i<4; i++){
00553         dc0+= src[i-stride];
00554         dc1+= src[4+i-stride];
00555     }
00556     dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
00557     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
00558 
00559     for(i=0; i<4; i++){
00560         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00561         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
00562     }
00563     for(i=4; i<8; i++){
00564         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00565         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
00566     }
00567 }
00568 
00569 static void FUNCC(pred8x16_top_dc)(uint8_t *_src, int stride){
00570     int i;
00571     int dc0, dc1;
00572     pixel4 dc0splat, dc1splat;
00573     pixel *src = (pixel*)_src;
00574     stride >>= sizeof(pixel)-1;
00575 
00576     dc0=dc1=0;
00577     for(i=0;i<4; i++){
00578         dc0+= src[i-stride];
00579         dc1+= src[4+i-stride];
00580     }
00581     dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
00582     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
00583 
00584     for(i=0; i<16; i++){
00585         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00586         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
00587     }
00588 }
00589 
00590 static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
00591     int i;
00592     int dc0, dc1, dc2;
00593     pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
00594     pixel *src = (pixel*)_src;
00595     stride /= sizeof(pixel);
00596 
00597     dc0=dc1=dc2=0;
00598     for(i=0;i<4; i++){
00599         dc0+= src[-1+i*stride] + src[i-stride];
00600         dc1+= src[4+i-stride];
00601         dc2+= src[-1+(i+4)*stride];
00602     }
00603     dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
00604     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
00605     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
00606     dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
00607 
00608     for(i=0; i<4; i++){
00609         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00610         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
00611     }
00612     for(i=4; i<8; i++){
00613         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
00614         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
00615     }
00616 }
00617 
00618 static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){
00619     int i;
00620     int dc0, dc1, dc2, dc3, dc4;
00621     pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
00622     pixel *src = (pixel*)_src;
00623     stride >>= sizeof(pixel)-1;
00624 
00625     dc0=dc1=dc2=dc3=dc4=0;
00626     for(i=0;i<4; i++){
00627         dc0+= src[-1+i*stride] + src[i-stride];
00628         dc1+= src[4+i-stride];
00629         dc2+= src[-1+(i+4)*stride];
00630         dc3+= src[-1+(i+8)*stride];
00631         dc4+= src[-1+(i+12)*stride];
00632     }
00633     dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
00634     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
00635     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
00636     dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
00637     dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
00638     dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
00639     dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
00640     dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);
00641 
00642     for(i=0; i<4; i++){
00643         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
00644         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
00645     }
00646     for(i=4; i<8; i++){
00647         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
00648         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
00649     }
00650     for(i=8; i<12; i++){
00651         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
00652         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
00653     }
00654     for(i=12; i<16; i++){
00655         AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
00656         AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
00657     }
00658 }
00659 
00660 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
00661     FUNCC(pred8x8_top_dc)(src, stride);
00662     FUNCC(pred4x4_dc)(src, NULL, stride);
00663 }
00664 
00665 static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
00666     FUNCC(pred8x16_top_dc)(src, stride);
00667     FUNCC(pred4x4_dc)(src, NULL, stride);
00668 }
00669 
00670 static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
00671     FUNCC(pred8x8_dc)(src, stride);
00672     FUNCC(pred4x4_top_dc)(src, NULL, stride);
00673 }
00674 
00675 static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
00676     FUNCC(pred8x16_dc)(src, stride);
00677     FUNCC(pred4x4_top_dc)(src, NULL, stride);
00678 }
00679 
00680 static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
00681     FUNCC(pred8x8_left_dc)(src, stride);
00682     FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
00683     FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
00684 }
00685 
00686 static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
00687     FUNCC(pred8x16_left_dc)(src, stride);
00688     FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
00689     FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
00690 }
00691 
00692 static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
00693     FUNCC(pred8x8_left_dc)(src, stride);
00694     FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
00695     FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
00696 }
00697 
00698 static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
00699     FUNCC(pred8x16_left_dc)(src, stride);
00700     FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
00701     FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
00702 }
00703 
00704 static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
00705   int j, k;
00706   int a;
00707   INIT_CLIP
00708   pixel *src = (pixel*)_src;
00709   int stride = _stride/sizeof(pixel);
00710   const pixel * const src0 = src +3-stride;
00711   const pixel *       src1 = src +4*stride-1;
00712   const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
00713   int H = src0[1] - src0[-1];
00714   int V = src1[0] - src2[ 0];
00715   for(k=2; k<=4; ++k) {
00716     src1 += stride; src2 -= stride;
00717     H += k*(src0[k] - src0[-k]);
00718     V += k*(src1[0] - src2[ 0]);
00719   }
00720   H = ( 17*H+16 ) >> 5;
00721   V = ( 17*V+16 ) >> 5;
00722 
00723   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
00724   for(j=8; j>0; --j) {
00725     int b = a;
00726     a += V;
00727     src[0] = CLIP((b    ) >> 5);
00728     src[1] = CLIP((b+  H) >> 5);
00729     src[2] = CLIP((b+2*H) >> 5);
00730     src[3] = CLIP((b+3*H) >> 5);
00731     src[4] = CLIP((b+4*H) >> 5);
00732     src[5] = CLIP((b+5*H) >> 5);
00733     src[6] = CLIP((b+6*H) >> 5);
00734     src[7] = CLIP((b+7*H) >> 5);
00735     src += stride;
00736   }
00737 }
00738 
00739 static void FUNCC(pred8x16_plane)(uint8_t *_src, int _stride){
00740   int j, k;
00741   int a;
00742   INIT_CLIP
00743   pixel *src = (pixel*)_src;
00744   int stride = _stride>>(sizeof(pixel)-1);
00745   const pixel * const src0 = src +3-stride;
00746   const pixel *       src1 = src +8*stride-1;
00747   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
00748   int H = src0[1] - src0[-1];
00749   int V = src1[0] - src2[ 0];
00750 
00751   for (k = 2; k <= 4; ++k) {
00752       src1 += stride; src2 -= stride;
00753       H += k*(src0[k] - src0[-k]);
00754       V += k*(src1[0] - src2[ 0]);
00755   }
00756   for (; k <= 8; ++k) {
00757       src1 += stride; src2 -= stride;
00758       V += k*(src1[0] - src2[0]);
00759   }
00760 
00761   H = (17*H+16) >> 5;
00762   V = (5*V+32) >> 6;
00763 
00764   a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
00765   for(j=16; j>0; --j) {
00766     int b = a;
00767     a += V;
00768     src[0] = CLIP((b    ) >> 5);
00769     src[1] = CLIP((b+  H) >> 5);
00770     src[2] = CLIP((b+2*H) >> 5);
00771     src[3] = CLIP((b+3*H) >> 5);
00772     src[4] = CLIP((b+4*H) >> 5);
00773     src[5] = CLIP((b+5*H) >> 5);
00774     src[6] = CLIP((b+6*H) >> 5);
00775     src[7] = CLIP((b+7*H) >> 5);
00776     src += stride;
00777   }
00778 }
00779 
/*
 * Helpers for the pred8x8l_* predictors. All of them expect `src` (pixel
 * pointer to the block) and `stride` (in pixels) in the invoking scope;
 * the loaders also read `has_topleft` / `has_topright`.
 */

/* sample at column x, row y relative to the first sample of the block */
#define SRC(x,y) src[(x)+(y)*stride]

/* l<y>: left neighbour of row y, 3-tap filtered along the column */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/*
 * Declares l0..l7. For l0 the corner sample is replaced by SRC(-1,0) when
 * has_topleft is zero; l7 counts the last left sample three times.
 */
#define PREDICT_8x8_LOAD_LEFT                                    \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0))       \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;        \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)                          \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: top neighbour of column x, 3-tap filtered along the row */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declares t0..t7. For t0 the corner sample is replaced by SRC(0,-1) when
 * has_topleft is zero; for t7 SRC(8,-1) is replaced by SRC(7,-1) when
 * has_topright is zero.
 */
#define PREDICT_8x8_LOAD_TOP                                     \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1))       \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;        \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)                          \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* assignment form of PT(), for the already declared t8..t14 */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/*
 * Declares t8..t15. With has_topright they are the filtered samples
 * 8..15 of the row above; otherwise all of them are set to SRC(7,-1).
 */
#define PREDICT_8x8_LOAD_TOPRIGHT                                \
    int t8, t9, t10, t11, t12, t13, t14, t15;                    \
    if(has_topright) {                                           \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14)    \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2;              \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* lt: filtered top-left corner sample */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/*
 * Fill eight rows of eight pixels with the 4-pixel pattern v. Declares `y`
 * and advances `src` by one stride per row.
 */
#define PREDICT_8x8_DC(v)                  \
    int y;                                 \
    for( y = 0; y < 8; y++ ) {             \
        AV_WN4PA(((pixel4*)src)+0, v);     \
        AV_WN4PA(((pixel4*)src)+1, v);     \
        src += stride;                     \
    }
00817 
00818 static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00819 {
00820     pixel *src = (pixel*)_src;
00821     int stride = _stride/sizeof(pixel);
00822 
00823     PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
00824 }
00825 static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00826 {
00827     pixel *src = (pixel*)_src;
00828     int stride = _stride/sizeof(pixel);
00829 
00830     PREDICT_8x8_LOAD_LEFT;
00831     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
00832     PREDICT_8x8_DC(dc);
00833 }
00834 static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00835 {
00836     pixel *src = (pixel*)_src;
00837     int stride = _stride/sizeof(pixel);
00838 
00839     PREDICT_8x8_LOAD_TOP;
00840     const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
00841     PREDICT_8x8_DC(dc);
00842 }
00843 static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00844 {
00845     pixel *src = (pixel*)_src;
00846     int stride = _stride/sizeof(pixel);
00847 
00848     PREDICT_8x8_LOAD_LEFT;
00849     PREDICT_8x8_LOAD_TOP;
00850     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
00851                                      +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
00852     PREDICT_8x8_DC(dc);
00853 }
00854 static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00855 {
00856     pixel *src = (pixel*)_src;
00857     int stride = _stride/sizeof(pixel);
00858     pixel4 a;
00859 
00860     PREDICT_8x8_LOAD_LEFT;
00861 #define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
00862                AV_WN4PA(src+y*stride, a); \
00863                AV_WN4PA(src+y*stride+4, a);
00864     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
00865 #undef ROW
00866 }
00867 static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00868 {
00869     int y;
00870     pixel *src = (pixel*)_src;
00871     int stride = _stride/sizeof(pixel);
00872     pixel4 a, b;
00873 
00874     PREDICT_8x8_LOAD_TOP;
00875     src[0] = t0;
00876     src[1] = t1;
00877     src[2] = t2;
00878     src[3] = t3;
00879     src[4] = t4;
00880     src[5] = t5;
00881     src[6] = t6;
00882     src[7] = t7;
00883     a = AV_RN4PA(((pixel4*)src)+0);
00884     b = AV_RN4PA(((pixel4*)src)+1);
00885     for( y = 1; y < 8; y++ ) {
00886         AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
00887         AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
00888     }
00889 }
/**
 * 8x8 luma intra prediction, "down left" variant.
 *
 * Reads only the values t0..t15 that PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT bring into scope (both macros, and SRC, are
 * defined earlier in this file, outside this function). Every SRC(a,b)
 * position with the same a+b receives the same value, the rounded 3-tap blend
 * (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2 with k = a+b. The final position,
 * SRC(7,7), would need a t16 and uses (t14 + 3*t15 + 2) >> 2 instead.
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      the PREDICT_8x8_LOAD_* macros -- confirm against their
 *                      definitions
 * @param has_topright  same remark as has_topleft
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
00890 static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00891 {
00892     pixel *src = (pixel*)_src;
00893     int stride = _stride/sizeof(pixel);
00894     PREDICT_8x8_LOAD_TOP;
00895     PREDICT_8x8_LOAD_TOPRIGHT;
/* one statement per a+b value, from a+b == 0 up to a+b == 14 */
00896     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
00897     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
00898     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
00899     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
00900     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
00901     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
00902     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
00903     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
00904     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
00905     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
00906     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
00907     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
00908     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
00909     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
/* last position: t15 is weighted three times because there is no t16 */
00910     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
00911 }
/**
 * 8x8 luma intra prediction, "down right" variant.
 *
 * Reads t0..t7, l0..l7 and lt, brought into scope by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (these macros, and SRC,
 * are defined earlier in this file, outside this function). Every SRC(a,b)
 * position with the same a-b receives the same value: a rounded
 * (p + 2*q + r + 2) >> 2 blend of three consecutive entries of the sequence
 * l7, l6, ..., l0, lt, t0, ..., t7. The statements run from a-b == -7
 * (centred on l6) through a-b == 0 (centred on lt) to a-b == 7 (centred
 * on t6).
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      the PREDICT_8x8_LOAD_* macros -- confirm against their
 *                      definitions
 * @param has_topright  same remark as has_topleft
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
00912 static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00913 {
00914     pixel *src = (pixel*)_src;
00915     int stride = _stride/sizeof(pixel);
00916     PREDICT_8x8_LOAD_TOP;
00917     PREDICT_8x8_LOAD_LEFT;
00918     PREDICT_8x8_LOAD_TOPLEFT;
/* values derived from the l* entries only */
00919     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
00920     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
00921     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
00922     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
00923     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
00924     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
/* the three statements whose blend involves lt */
00925     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
00926     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
00927     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
/* values derived from the t* entries only */
00928     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
00929     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
00930     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
00931     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
00932     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
00933     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
00934 }
/**
 * 8x8 luma intra prediction, "vertical right" variant.
 *
 * Reads t0..t7, l0..l6 and lt, brought into scope by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (these macros, and SRC,
 * are defined earlier in this file, outside this function); l7 is loaded but
 * not used here. A value assigned to SRC(a,b) is shared with SRC(a+1,b+2),
 * SRC(a+2,b+4), ... as long as both arguments stay within 0..7.
 * Two kinds of values appear: rounded 2-tap averages (p + q + 1) >> 1 and
 * rounded 3-tap blends (p + 2*q + r + 2) >> 2.
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      the PREDICT_8x8_LOAD_* macros -- confirm against their
 *                      definitions
 * @param has_topright  same remark as has_topleft
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
00935 static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00936 {
00937     pixel *src = (pixel*)_src;
00938     int stride = _stride/sizeof(pixel);
00939     PREDICT_8x8_LOAD_TOP;
00940     PREDICT_8x8_LOAD_LEFT;
00941     PREDICT_8x8_LOAD_TOPLEFT;
/* 3-tap blends built from l* values (the last two also take in lt) */
00942     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
00943     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
00944     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
00945     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
00946     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
00947     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
/* from here on, 3-tap blends alternate with 2-tap averages */
00948     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
00949     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
00950     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
00951     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
00952     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
00953     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
00954     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
00955     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
00956     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
00957     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
00958     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
00959     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
00960     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
00961     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
00962     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
00963     SRC(7,0)= (t6 + t7 + 1) >> 1;
00964 }
/**
 * 8x8 luma intra prediction, "horizontal down" variant.
 *
 * Reads l0..l7, lt and t0..t6, brought into scope by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (these macros, and SRC,
 * are defined earlier in this file, outside this function); t7 is loaded but
 * not used here. A value assigned to SRC(a,b) is shared with SRC(a+2,b+1),
 * SRC(a+4,b+2), ... as long as both arguments stay within 0..7.
 * Two kinds of values appear: rounded 2-tap averages (p + q + 1) >> 1 and
 * rounded 3-tap blends (p + 2*q + r + 2) >> 2.
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      the PREDICT_8x8_LOAD_* macros -- confirm against their
 *                      definitions
 * @param has_topright  same remark as has_topleft
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
00965 static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00966 {
00967     pixel *src = (pixel*)_src;
00968     int stride = _stride/sizeof(pixel);
00969     PREDICT_8x8_LOAD_TOP;
00970     PREDICT_8x8_LOAD_LEFT;
00971     PREDICT_8x8_LOAD_TOPLEFT;
/* 2-tap averages alternate with 3-tap blends while walking from l7 towards lt */
00972     SRC(0,7)= (l6 + l7 + 1) >> 1;
00973     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
00974     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
00975     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
00976     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
00977     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
00978     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
00979     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
00980     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
00981     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
00982     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
00983     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
00984     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
00985     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
00986     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
00987     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
/* the remaining values are 3-tap blends only, built from lt and t* */
00988     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
00989     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
00990     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
00991     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
00992     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
00993     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
00994 }
/**
 * 8x8 luma intra prediction, "vertical left" variant.
 *
 * Reads t0..t12 out of the values that PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT bring into scope (both macros, and SRC, are
 * defined earlier in this file, outside this function). A value assigned to
 * SRC(a,b) is shared with SRC(a+1,b-2), SRC(a+2,b-4), ... as long as both
 * arguments stay within 0..7. Positions whose second SRC argument is even get
 * a rounded 2-tap average (p + q + 1) >> 1; positions whose second argument
 * is odd get a rounded 3-tap blend (p + 2*q + r + 2) >> 2.
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      the PREDICT_8x8_LOAD_* macros -- confirm against their
 *                      definitions
 * @param has_topright  same remark as has_topleft
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
00995 static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
00996 {
00997     pixel *src = (pixel*)_src;
00998     int stride = _stride/sizeof(pixel);
00999     PREDICT_8x8_LOAD_TOP;
01000     PREDICT_8x8_LOAD_TOPRIGHT;
/* statements come in pairs: a 2-tap average, then the 3-tap blend that starts at the same t index */
01001     SRC(0,0)= (t0 + t1 + 1) >> 1;
01002     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
01003     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
01004     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
01005     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
01006     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
01007     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
01008     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
01009     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
01010     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
01011     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
01012     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
01013     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
01014     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
01015     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
01016     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
01017     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
01018     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
01019     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
01020     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
01021     SRC(7,6)= (t10 + t11 + 1) >> 1;
01022     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
01023 }
/**
 * 8x8 luma intra prediction, "horizontal up" variant.
 *
 * Reads only l0..l7, brought into scope by PREDICT_8x8_LOAD_LEFT (that macro,
 * and SRC, are defined earlier in this file, outside this function). A value
 * assigned to SRC(a,b) is shared with SRC(a+2,b-1), SRC(a+4,b-2), ... as long
 * as both arguments stay within 0..7. Positions with an even first SRC
 * argument get a rounded 2-tap average (p + q + 1) >> 1; positions with an
 * odd first argument get a rounded 3-tap blend (p + 2*q + r + 2) >> 2. Once
 * the blends run past l7, the remaining 20 positions are set to l7 itself.
 *
 * @param _src          destination block, reinterpreted as pixel*
 * @param has_topleft   not referenced directly here; presumably consumed by
 *                      PREDICT_8x8_LOAD_LEFT -- confirm against its definition
 * @param has_topright  not referenced directly here
 * @param _stride       line size; divided by sizeof(pixel) to get a stride
 *                      counted in pixels
 */
01024 static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
01025 {
01026     pixel *src = (pixel*)_src;
01027     int stride = _stride/sizeof(pixel);
01028     PREDICT_8x8_LOAD_LEFT;
01029     SRC(0,0)= (l0 + l1 + 1) >> 1;
01030     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
01031     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
01032     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
01033     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
01034     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
01035     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
01036     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
01037     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
01038     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
01039     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
01040     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
01041     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
/* no l8 exists, so l7 is weighted three times in the last blend */
01042     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
/* every position not assigned above is a plain copy of l7 */
01043     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
01044     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
01045     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
01046     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
01047 }
01048 #undef PREDICT_8x8_LOAD_LEFT
01049 #undef PREDICT_8x8_LOAD_TOP
01050 #undef PREDICT_8x8_LOAD_TOPLEFT
01051 #undef PREDICT_8x8_LOAD_TOPRIGHT
01052 #undef PREDICT_8x8_DC
01053 #undef PTR
01054 #undef PT
01055 #undef PL
01056 #undef SRC
01057 
01058 static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
01059     int i;
01060     pixel *pix = (pixel*)_pix;
01061     const dctcoef *block = (const dctcoef*)_block;
01062     stride /= sizeof(pixel);
01063     pix -= stride;
01064     for(i=0; i<4; i++){
01065         pixel v = pix[0];
01066         pix[1*stride]= v += block[0];
01067         pix[2*stride]= v += block[4];
01068         pix[3*stride]= v += block[8];
01069         pix[4*stride]= v +  block[12];
01070         pix++;
01071         block++;
01072     }
01073 }
01074 
01075 static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
01076     int i;
01077     pixel *pix = (pixel*)_pix;
01078     const dctcoef *block = (const dctcoef*)_block;
01079     stride /= sizeof(pixel);
01080     for(i=0; i<4; i++){
01081         pixel v = pix[-1];
01082         pix[0]= v += block[0];
01083         pix[1]= v += block[1];
01084         pix[2]= v += block[2];
01085         pix[3]= v +  block[3];
01086         pix+= stride;
01087         block+= 4;
01088     }
01089 }
01090 
01091 static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
01092     int i;
01093     pixel *pix = (pixel*)_pix;
01094     const dctcoef *block = (const dctcoef*)_block;
01095     stride /= sizeof(pixel);
01096     pix -= stride;
01097     for(i=0; i<8; i++){
01098         pixel v = pix[0];
01099         pix[1*stride]= v += block[0];
01100         pix[2*stride]= v += block[8];
01101         pix[3*stride]= v += block[16];
01102         pix[4*stride]= v += block[24];
01103         pix[5*stride]= v += block[32];
01104         pix[6*stride]= v += block[40];
01105         pix[7*stride]= v += block[48];
01106         pix[8*stride]= v +  block[56];
01107         pix++;
01108         block++;
01109     }
01110 }
01111 
01112 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
01113     int i;
01114     pixel *pix = (pixel*)_pix;
01115     const dctcoef *block = (const dctcoef*)_block;
01116     stride /= sizeof(pixel);
01117     for(i=0; i<8; i++){
01118         pixel v = pix[-1];
01119         pix[0]= v += block[0];
01120         pix[1]= v += block[1];
01121         pix[2]= v += block[2];
01122         pix[3]= v += block[3];
01123         pix[4]= v += block[4];
01124         pix[5]= v += block[5];
01125         pix[6]= v += block[6];
01126         pix[7]= v +  block[7];
01127         pix+= stride;
01128         block+= 8;
01129     }
01130 }
01131 
01132 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01133     int i;
01134     for(i=0; i<16; i++)
01135         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01136 }
01137 
01138 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01139     int i;
01140     for(i=0; i<16; i++)
01141         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01142 }
01143 
01144 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01145     int i;
01146     for(i=0; i<4; i++)
01147         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01148 }
01149 
01150 static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01151     int i;
01152     for(i=0; i<4; i++)
01153         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01154     for(i=4; i<8; i++)
01155         FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
01156 }
01157 
01158 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01159     int i;
01160     for(i=0; i<4; i++)
01161         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01162 }
01163 
01164 static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
01165     int i;
01166     for(i=0; i<4; i++)
01167         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
01168     for(i=4; i<8; i++)
01169         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
01170 }