Libav 0.7.1
libavcodec/dsputil.c
Go to the documentation of this file.
00001 /*
00002  * DSP utils
00003  * Copyright (c) 2000, 2001 Fabrice Bellard
00004  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
00007  *
00008  * This file is part of Libav.
00009  *
00010  * Libav is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Lesser General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2.1 of the License, or (at your option) any later version.
00014  *
00015  * Libav is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with Libav; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00023  */
00024 
00030 #include "libavutil/imgutils.h"
00031 #include "avcodec.h"
00032 #include "dsputil.h"
00033 #include "simple_idct.h"
00034 #include "faandct.h"
00035 #include "faanidct.h"
00036 #include "mathops.h"
00037 #include "mpegvideo.h"
00038 #include "config.h"
00039 #include "ac3dec.h"
00040 #include "vorbis.h"
00041 #include "png.h"
00042 
00043 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
00044 uint32_t ff_squareTbl[512] = {0, };
00045 
00046 #define BIT_DEPTH 9
00047 #include "dsputil_template.c"
00048 #undef BIT_DEPTH
00049 
00050 #define BIT_DEPTH 10
00051 #include "dsputil_template.c"
00052 #undef BIT_DEPTH
00053 
00054 #define BIT_DEPTH 8
00055 #include "dsputil_template.c"
00056 
/* Byte-replicated constants of type unsigned long: every byte is 0x7f
 * (resp. 0x80), i.e. 0x7f7f7f7f for a 32-bit unsigned long and
 * 0x7f7f7f7f7f7f7f7f for a 64-bit one. */
#define pb_7f ((~0UL / 0xffUL) * 0x7fUL)
#define pb_80 ((~0UL / 0xffUL) * 0x80UL)
00060 
/* Zigzag scan order over an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th position in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,   /* scan positions  0.. 7 */
    17, 24, 32, 25, 18, 11,  4,  5,   /* scan positions  8..15 */
    12, 19, 26, 33, 40, 48, 41, 34,   /* scan positions 16..23 */
    27, 20, 13,  6,  7, 14, 21, 28,   /* scan positions 24..31 */
    35, 42, 49, 56, 57, 50, 43, 36,   /* scan positions 32..39 */
    29, 22, 15, 23, 30, 37, 44, 51,   /* scan positions 40..47 */
    58, 59, 52, 45, 38, 31, 39, 46,   /* scan positions 48..55 */
    53, 60, 61, 54, 47, 55, 62, 63,   /* scan positions 56..63 */
};
00071 
/* Zigzag scan used with the 248 IDCT. Unlike the specification, the two
 * fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,   /* scan positions  0.. 7 */
    17, 25, 32, 40, 48, 56, 33, 41,   /* scan positions  8..15 */
    18, 26,  3, 11,  4, 12, 19, 27,   /* scan positions 16..23 */
    34, 42, 49, 57, 50, 58, 35, 43,   /* scan positions 24..31 */
    20, 28,  5, 13,  6, 14, 21, 29,   /* scan positions 32..39 */
    36, 44, 51, 59, 52, 60, 37, 45,   /* scan positions 40..47 */
    22, 30,  7, 15, 23, 31, 38, 46,   /* scan positions 48..55 */
    53, 61, 54, 62, 39, 47, 55, 63,   /* scan positions 56..63 */
};
00084 
/* Inverse of ff_zigzag_direct (not permuted), with 1 added to every entry,
 * for use by the MMX quantizer. 16-byte aligned. No initializer is given
 * here; presumably populated during DSP initialisation - confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
00087 
/* Alternate horizontal scan order over an 8x8 block; a permutation of the
 * raster indices 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,   /* scan positions  0.. 7 */
    10, 11,  4,  5,  6,  7, 15, 14,   /* scan positions  8..15 */
    13, 12, 19, 18, 24, 25, 32, 33,   /* scan positions 16..23 */
    26, 27, 20, 21, 22, 23, 28, 29,   /* scan positions 24..31 */
    30, 31, 34, 35, 40, 41, 48, 49,   /* scan positions 32..39 */
    42, 43, 36, 37, 38, 39, 44, 45,   /* scan positions 40..47 */
    46, 47, 50, 51, 56, 57, 58, 59,   /* scan positions 48..55 */
    52, 53, 54, 55, 60, 61, 62, 63,   /* scan positions 56..63 */
};
00098 
/* Alternate vertical scan order over an 8x8 block; a permutation of the
 * raster indices 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,   /* scan positions  0.. 7 */
    17, 25, 32, 40, 48, 56, 57, 49,   /* scan positions  8..15 */
    41, 33, 26, 18,  3, 11,  4, 12,   /* scan positions 16..23 */
    19, 27, 34, 42, 50, 58, 35, 43,   /* scan positions 24..31 */
    51, 59, 20, 28,  5, 13,  6, 14,   /* scan positions 32..39 */
    21, 29, 36, 44, 52, 60, 37, 45,   /* scan positions 40..47 */
    53, 61, 22, 30,  7, 15, 23, 31,   /* scan positions 48..55 */
    38, 46, 54, 62, 39, 47, 55, 63,   /* scan positions 56..63 */
};
00109 
/* Coefficient input permutation for simple_idct_mmx; a permutation of
 * 0x00..0x3F, one row of eight entries per line. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,   /* entries  0.. 7 */
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,   /* entries  8..15 */
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,   /* entries 16..23 */
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,   /* entries 24..31 */
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,   /* entries 32..39 */
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,   /* entries 40..47 */
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,   /* entries 48..55 */
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,   /* entries 56..63 */
};
00121 
/* Row permutation table (by its name, for the SSE2 IDCT; its use lies
 * outside this part of the file). */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7,
};
00123 
00124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
00125     int i;
00126     int end;
00127 
00128     st->scantable= src_scantable;
00129 
00130     for(i=0; i<64; i++){
00131         int j;
00132         j = src_scantable[i];
00133         st->permutated[i] = permutation[j];
00134 #if ARCH_PPC
00135         st->inverse[j] = i;
00136 #endif
00137     }
00138 
00139     end=-1;
00140     for(i=0; i<64; i++){
00141         int j;
00142         j = st->permutated[i];
00143         if(j>end) end=j;
00144         st->raster_end[i]= end;
00145     }
00146 }
00147 
/**
 * Sum all pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
00169 
00170 static int pix_norm1_c(uint8_t * pix, int line_size)
00171 {
00172     int s, i, j;
00173     uint32_t *sq = ff_squareTbl + 256;
00174 
00175     s = 0;
00176     for (i = 0; i < 16; i++) {
00177         for (j = 0; j < 16; j += 8) {
00178 #if 0
00179             s += sq[pix[0]];
00180             s += sq[pix[1]];
00181             s += sq[pix[2]];
00182             s += sq[pix[3]];
00183             s += sq[pix[4]];
00184             s += sq[pix[5]];
00185             s += sq[pix[6]];
00186             s += sq[pix[7]];
00187 #else
00188 #if LONG_MAX > 2147483647
00189             register uint64_t x=*(uint64_t*)pix;
00190             s += sq[x&0xff];
00191             s += sq[(x>>8)&0xff];
00192             s += sq[(x>>16)&0xff];
00193             s += sq[(x>>24)&0xff];
00194             s += sq[(x>>32)&0xff];
00195             s += sq[(x>>40)&0xff];
00196             s += sq[(x>>48)&0xff];
00197             s += sq[(x>>56)&0xff];
00198 #else
00199             register uint32_t x=*(uint32_t*)pix;
00200             s += sq[x&0xff];
00201             s += sq[(x>>8)&0xff];
00202             s += sq[(x>>16)&0xff];
00203             s += sq[(x>>24)&0xff];
00204             x=*(uint32_t*)(pix+4);
00205             s += sq[x&0xff];
00206             s += sq[(x>>8)&0xff];
00207             s += sq[(x>>16)&0xff];
00208             s += sq[(x>>24)&0xff];
00209 #endif
00210 #endif
00211             pix += 8;
00212         }
00213         pix += line_size - 16;
00214     }
00215     return s;
00216 }
00217 
/**
 * Byte-swap w 32-bit words from src into dst, in ascending index order.
 * A count of zero or less copies nothing.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
00235 
/**
 * Byte-swap len 16-bit words from src into dst, in ascending index order.
 *
 * A count of zero or less copies nothing, matching bswap_buf(). The former
 * `while (len--)` loop never stopped for a negative len and ran past both
 * buffers.
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
00241 
00242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00243 {
00244     int s, i;
00245     uint32_t *sq = ff_squareTbl + 256;
00246 
00247     s = 0;
00248     for (i = 0; i < h; i++) {
00249         s += sq[pix1[0] - pix2[0]];
00250         s += sq[pix1[1] - pix2[1]];
00251         s += sq[pix1[2] - pix2[2]];
00252         s += sq[pix1[3] - pix2[3]];
00253         pix1 += line_size;
00254         pix2 += line_size;
00255     }
00256     return s;
00257 }
00258 
00259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00260 {
00261     int s, i;
00262     uint32_t *sq = ff_squareTbl + 256;
00263 
00264     s = 0;
00265     for (i = 0; i < h; i++) {
00266         s += sq[pix1[0] - pix2[0]];
00267         s += sq[pix1[1] - pix2[1]];
00268         s += sq[pix1[2] - pix2[2]];
00269         s += sq[pix1[3] - pix2[3]];
00270         s += sq[pix1[4] - pix2[4]];
00271         s += sq[pix1[5] - pix2[5]];
00272         s += sq[pix1[6] - pix2[6]];
00273         s += sq[pix1[7] - pix2[7]];
00274         pix1 += line_size;
00275         pix2 += line_size;
00276     }
00277     return s;
00278 }
00279 
00280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00281 {
00282     int s, i;
00283     uint32_t *sq = ff_squareTbl + 256;
00284 
00285     s = 0;
00286     for (i = 0; i < h; i++) {
00287         s += sq[pix1[ 0] - pix2[ 0]];
00288         s += sq[pix1[ 1] - pix2[ 1]];
00289         s += sq[pix1[ 2] - pix2[ 2]];
00290         s += sq[pix1[ 3] - pix2[ 3]];
00291         s += sq[pix1[ 4] - pix2[ 4]];
00292         s += sq[pix1[ 5] - pix2[ 5]];
00293         s += sq[pix1[ 6] - pix2[ 6]];
00294         s += sq[pix1[ 7] - pix2[ 7]];
00295         s += sq[pix1[ 8] - pix2[ 8]];
00296         s += sq[pix1[ 9] - pix2[ 9]];
00297         s += sq[pix1[10] - pix2[10]];
00298         s += sq[pix1[11] - pix2[11]];
00299         s += sq[pix1[12] - pix2[12]];
00300         s += sq[pix1[13] - pix2[13]];
00301         s += sq[pix1[14] - pix2[14]];
00302         s += sq[pix1[15] - pix2[15]];
00303 
00304         pix1 += line_size;
00305         pix2 += line_size;
00306     }
00307     return s;
00308 }
00309 
00310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
00311 {
00312     int i;
00313 
00314     /* read the pixels */
00315     for(i=0;i<8;i++) {
00316         block[0] = pixels[0];
00317         block[1] = pixels[1];
00318         block[2] = pixels[2];
00319         block[3] = pixels[3];
00320         block[4] = pixels[4];
00321         block[5] = pixels[5];
00322         block[6] = pixels[6];
00323         block[7] = pixels[7];
00324         pixels += line_size;
00325         block += 8;
00326     }
00327 }
00328 
00329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
00330                           const uint8_t *s2, int stride){
00331     int i;
00332 
00333     /* read the pixels */
00334     for(i=0;i<8;i++) {
00335         block[0] = s1[0] - s2[0];
00336         block[1] = s1[1] - s2[1];
00337         block[2] = s1[2] - s2[2];
00338         block[3] = s1[3] - s2[3];
00339         block[4] = s1[4] - s2[4];
00340         block[5] = s1[5] - s2[5];
00341         block[6] = s1[6] - s2[6];
00342         block[7] = s1[7] - s2[7];
00343         s1 += stride;
00344         s2 += stride;
00345         block += 8;
00346     }
00347 }
00348 
00349 
00350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00351                              int line_size)
00352 {
00353     int i;
00354     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00355 
00356     /* read the pixels */
00357     for(i=0;i<8;i++) {
00358         pixels[0] = cm[block[0]];
00359         pixels[1] = cm[block[1]];
00360         pixels[2] = cm[block[2]];
00361         pixels[3] = cm[block[3]];
00362         pixels[4] = cm[block[4]];
00363         pixels[5] = cm[block[5]];
00364         pixels[6] = cm[block[6]];
00365         pixels[7] = cm[block[7]];
00366 
00367         pixels += line_size;
00368         block += 8;
00369     }
00370 }
00371 
00372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00373                                  int line_size)
00374 {
00375     int i;
00376     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00377 
00378     /* read the pixels */
00379     for(i=0;i<4;i++) {
00380         pixels[0] = cm[block[0]];
00381         pixels[1] = cm[block[1]];
00382         pixels[2] = cm[block[2]];
00383         pixels[3] = cm[block[3]];
00384 
00385         pixels += line_size;
00386         block += 8;
00387     }
00388 }
00389 
00390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00391                                  int line_size)
00392 {
00393     int i;
00394     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00395 
00396     /* read the pixels */
00397     for(i=0;i<2;i++) {
00398         pixels[0] = cm[block[0]];
00399         pixels[1] = cm[block[1]];
00400 
00401         pixels += line_size;
00402         block += 8;
00403     }
00404 }
00405 
00406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
00407                                     uint8_t *restrict pixels,
00408                                     int line_size)
00409 {
00410     int i, j;
00411 
00412     for (i = 0; i < 8; i++) {
00413         for (j = 0; j < 8; j++) {
00414             if (*block < -128)
00415                 *pixels = 0;
00416             else if (*block > 127)
00417                 *pixels = 255;
00418             else
00419                 *pixels = (uint8_t)(*block + 128);
00420             block++;
00421             pixels++;
00422         }
00423         pixels += (line_size - 8);
00424     }
00425 }
00426 
00427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00428                                     int line_size)
00429 {
00430     int i;
00431 
00432     /* read the pixels */
00433     for(i=0;i<8;i++) {
00434         pixels[0] = block[0];
00435         pixels[1] = block[1];
00436         pixels[2] = block[2];
00437         pixels[3] = block[3];
00438         pixels[4] = block[4];
00439         pixels[5] = block[5];
00440         pixels[6] = block[6];
00441         pixels[7] = block[7];
00442 
00443         pixels += line_size;
00444         block += 8;
00445     }
00446 }
00447 
00448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00449                              int line_size)
00450 {
00451     int i;
00452     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00453 
00454     /* read the pixels */
00455     for(i=0;i<8;i++) {
00456         pixels[0] = cm[pixels[0] + block[0]];
00457         pixels[1] = cm[pixels[1] + block[1]];
00458         pixels[2] = cm[pixels[2] + block[2]];
00459         pixels[3] = cm[pixels[3] + block[3]];
00460         pixels[4] = cm[pixels[4] + block[4]];
00461         pixels[5] = cm[pixels[5] + block[5]];
00462         pixels[6] = cm[pixels[6] + block[6]];
00463         pixels[7] = cm[pixels[7] + block[7]];
00464         pixels += line_size;
00465         block += 8;
00466     }
00467 }
00468 
00469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00470                           int line_size)
00471 {
00472     int i;
00473     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00474 
00475     /* read the pixels */
00476     for(i=0;i<4;i++) {
00477         pixels[0] = cm[pixels[0] + block[0]];
00478         pixels[1] = cm[pixels[1] + block[1]];
00479         pixels[2] = cm[pixels[2] + block[2]];
00480         pixels[3] = cm[pixels[3] + block[3]];
00481         pixels += line_size;
00482         block += 8;
00483     }
00484 }
00485 
00486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00487                           int line_size)
00488 {
00489     int i;
00490     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00491 
00492     /* read the pixels */
00493     for(i=0;i<2;i++) {
00494         pixels[0] = cm[pixels[0] + block[0]];
00495         pixels[1] = cm[pixels[1] + block[1]];
00496         pixels += line_size;
00497         block += 8;
00498     }
00499 }
00500 
00501 static int sum_abs_dctelem_c(DCTELEM *block)
00502 {
00503     int sum=0, i;
00504     for(i=0; i<64; i++)
00505         sum+= FFABS(block[i]);
00506     return sum;
00507 }
00508 
/**
 * Set a block of 16 bytes by h rows to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row stride in bytes
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
00518 
/**
 * Set a block of 8 bytes by h rows to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row stride in bytes
 * @param h         number of rows
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
00528 
/**
 * Upscale an 8x8 byte block to 16x16 by duplicating every pixel
 * horizontally and vertically.
 *
 * Each source byte is multiplied by 0x0101 so that one 16-bit store writes
 * it twice side by side; the same 16-bit value is stored on two consecutive
 * destination rows.
 *
 * @param src      64 source bytes, 8 rows of 8
 * @param dst      top-left destination byte; accessed through uint16_t stores
 * @param linesize destination row stride in bytes
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        /* Source row `row` lands on destination rows 2*row and 2*row + 1. */
        uint16_t *upper = (uint16_t *)(dst + (2 * row)     * linesize);
        uint16_t *lower = (uint16_t *)(dst + (2 * row + 1) * linesize);

        for (col = 0; col < 8; col++) {
            const uint16_t pair = src[8 * row + col] * 0x0101;

            lower[col] = pair;
            upper[col] = pair;
        }
    }
}
00544 
/* Rounded integer averages of two and of four values. Every argument is
 * parenthesised so that expression arguments with low-precedence operators
 * (shifts, bitwise operators, ?:) are averaged as written. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
00547 
/**
 * Bilinear interpolation of a block 8 pixels wide and h rows high at a
 * fractional offset given in 1/16 pixel units.
 *
 * Each output pixel is a weighted sum of the four neighbouring source
 * pixels (weights sum to 256) plus rounder, shifted right by 8. Reads one
 * extra source column and one extra source row.
 *
 * @param dst     destination block
 * @param src     source block
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal fraction (weights use 16 - x16 and x16)
 * @param y16     vertical fraction (weights use 16 - y16 and y16)
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);   /* top-left     */
    const int w_tr =       x16  * (16 - y16);   /* top-right    */
    const int w_bl = (16 - x16) *       y16;    /* bottom-left  */
    const int w_br =       x16  *       y16;    /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
00570 
/**
 * Global motion compensation of a block 8 pixels wide and h rows high using
 * an affine source-position field and bilinear interpolation.
 *
 * The source position starts at (ox, oy) for each row, advances by
 * (dxx, dyx) per output pixel and by (dxy, dyy) per output row. Positions
 * are used after an arithmetic shift right by 16; the low `shift` bits of
 * the result form the interpolation fraction and the remaining bits the
 * integer source coordinate.
 *
 * A coordinate counts as inside when 0 <= coord < size - 1, so that the
 * neighbour at coord + 1 can be read. When one coordinate is outside it is
 * clipped to 0..size-1 and interpolation is done only along the other axis;
 * when both are outside the clipped source pixel is copied unchanged.
 *
 * @param dst    destination block
 * @param src    source picture
 * @param stride row stride in bytes, shared by dst and src
 * @param h      number of output rows
 * @param shift  number of fractional bits; results are shifted by 2 * shift
 * @param r      rounding constant added before the final shift
 * @param width  source width used for the inside test and clipping
 * @param height source height used for the inside test and clipping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    int x, y;

    /* From here on width/height hold the largest valid coordinate. */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        uint8_t *out = dst + y * stride;
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* The unsigned compare rejects negative coordinates as well. */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if (x_inside && y_inside) {
                index  = src_x + src_y * stride;
                out[x] = (  (  src[index             ] * (s - frac_x)
                             + src[index          + 1] *      frac_x ) * (s - frac_y)
                          + (  src[index + stride    ] * (s - frac_x)
                             + src[index + stride + 1] *      frac_x ) *      frac_y
                          + r) >> (shift * 2);
            } else if (x_inside) {
                index  = src_x + av_clip(src_y, 0, height) * stride;
                out[x] = (  (  src[index    ] * (s - frac_x)
                             + src[index + 1] *      frac_x ) * s
                          + r) >> (shift * 2);
            } else if (y_inside) {
                index  = av_clip(src_x, 0, width) + src_y * stride;
                out[x] = (  (  src[index         ] * (s - frac_y)
                             + src[index + stride] *      frac_y ) * s
                          + r) >> (shift * 2);
            } else {
                index  = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                out[x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
00628 
/* Third-pel MC, offset (0,0): plain copy, dispatched to the fixed-width
 * 8-bit copy routine for width 2, 4, 8 or 16. Any other width does nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
00637 
/* Third-pel MC, horizontal weights 2:1 on (src[x], src[x+1]).
 * 683 / 2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 2 * src[x] + src[x + 1] + 1;

            dst[x] = (683 * sum) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
00648 
/* Third-pel MC, horizontal weights 1:2 on (src[x], src[x+1]).
 * 683 / 2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = src[x] + 2 * src[x + 1] + 1;

            dst[x] = (683 * sum) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
00659 
/* Third-pel MC, vertical weights 2:1 on (src[x], src[x+stride]).
 * 683 / 2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 2 * src[x] + src[x + stride] + 1;

            dst[x] = (683 * sum) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
00670 
/* Third-pel MC, 2x2 weights 4,3 / 3,2 (top row / bottom row).
 * 2731 / 32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 4 * src[x]          + 3 * src[x + 1] +
                            3 * src[x + stride] + 2 * src[x + stride + 1] + 6;

            dst[x] = (2731 * sum) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
00681 
/* Third-pel MC, 2x2 weights 3,2 / 4,3 (top row / bottom row).
 * 2731 / 32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 3 * src[x]          + 2 * src[x + 1] +
                            4 * src[x + stride] + 3 * src[x + stride + 1] + 6;

            dst[x] = (2731 * sum) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
00692 
/* Third-pel MC, vertical weights 1:2 on (src[x], src[x+stride]).
 * 683 / 2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = src[x] + 2 * src[x + stride] + 1;

            dst[x] = (683 * sum) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
00703 
/* Third-pel MC, 2x2 weights 3,4 / 2,3 (top row / bottom row).
 * 2731 / 32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 3 * src[x]          + 4 * src[x + 1] +
                            2 * src[x + stride] + 3 * src[x + stride + 1] + 6;

            dst[x] = (2731 * sum) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
00714 
/* Third-pel MC, 2x2 weights 2,3 / 3,4 (top row / bottom row).
 * 2731 / 32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum = 2 * src[x]          + 3 * src[x + 1] +
                            3 * src[x + stride] + 4 * src[x + stride + 1] + 6;

            dst[x] = (2731 * sum) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
00725 
/* Third-pel MC, offset (0,0), averaging variant: dispatched to the
 * fixed-width 8-bit averaging routine for width 2, 4, 8 or 16. Any other
 * width does nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
00734 
/* Third-pel MC, horizontal weights 2:1 on (src[x], src[x+1]); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00745 
/* Third-pel MC, horizontal weights 1:2 on (src[x], src[x+1]); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00756 
/* Third-pel MC, vertical weights 2:1 on (src[x], src[x+stride]); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00767 
/* Third-pel MC, 2x2 weights 4,3 / 3,2 (top row / bottom row); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum  = 4 * src[x]          + 3 * src[x + 1] +
                             3 * src[x + stride] + 2 * src[x + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00778 
/* Third-pel MC, 2x2 weights 3,2 / 4,3 (top row / bottom row); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum  = 3 * src[x]          + 2 * src[x + 1] +
                             4 * src[x + stride] + 3 * src[x + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00789 
/* Third-pel MC, vertical weights 1:2 on (src[x], src[x+stride]); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00800 
/* Third-pel MC, 2x2 weights 3,4 / 2,3 (top row / bottom row); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum  = 3 * src[x]          + 4 * src[x + 1] +
                             2 * src[x + stride] + 3 * src[x + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
00811 
/* Third-pel MC, 2x2 weights 2,3 / 3,4 (top row / bottom row); the
 * prediction is then averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int sum  = 2 * src[x]          + 3 * src[x + 1] +
                             3 * src[x + stride] + 4 * src[x + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;

            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
#if 0
/* Disabled. TPEL_WIDTH(width) would define fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) that forward to the
 * generic put_tpel_pixels_mcXY_c() helpers with the width filled in.
 * The wrapper bodies used to read `void put_tpel_pixels_mcXY_c(...)`, which
 * is not a valid call statement; the stray `void` has been removed so the
 * macro compiles if this block is ever re-enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
00843 
/**
 * Instantiate one family of MPEG-4 style quarter-pel motion-compensation
 * functions for 8x8 and 16x16 blocks.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixels8/16_l2_8 and _l4_8 helpers that perform the final store
 * @param RND    infix pasted between "put" and the helper name; every
 *               intermediate plane is produced through such a "put" helper
 * @param OP     store macro invoked as OP(destination lvalue, filter sum).
 *               The sum is passed unscaled. The local `cm` declared in each
 *               lowpass function is not used by the macro body itself; it is
 *               there for the expansion of OP (see the op_* defines below).
 *
 * Lowpass filters: 8 taps (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32.
 * The 8-wide versions read samples 0..8 and the 16-wide versions samples
 * 0..16; taps that would fall outside that range are folded back onto
 * samples inside it (e.g. output 0 uses src[0], src[1] and src[2] twice).
 *
 * Generated per size N in {8, 16}:
 *  - OPNAME mpeg4_qpelN_h_lowpass / _v_lowpass
 *  - static OPNAME qpelN_mcXY_c for XY in 10 20 30 01 02 03 11 31 13 33 21 23
 *    12 32 22. X and Y appear to be the horizontal and vertical quarter-sample
 *    phase: mc20 runs only the horizontal filter, mc02 only the vertical one
 *    and mc22 both.
 *  - non-static ff_ OPNAME qpelN_mc{11,31,13,33,12,32}_old_c, which build
 *    separate halfH / halfV / halfHV planes and blend them in one final step.
 *
 * NOTE(review): copy_blockN and the pixelsN_l2_8 / pixelsN_l4_8 helpers are
 * defined outside this macro; presumably they copy an N-wide block and
 * average two / four sources respectively -- confirm against their
 * definitions.
 */
00844 #define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal filter: h rows, each row reads src[0..8] */\
00845 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00846     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00847     int i;\
00848     for(i=0; i<h; i++)\
00849     {\
00850         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
00851         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
00852         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
00853         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
00854         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
00855         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
00856         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
00857         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
00858         dst+=dstStride;\
00859         src+=srcStride;\
00860     }\
00861 }\
00862 \
/* 8-wide vertical filter: 8 columns, each column reads 9 rows and writes 8 */\
00863 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00864     const int w=8;\
00865     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00866     int i;\
00867     for(i=0; i<w; i++)\
00868     {\
00869         const int src0= src[0*srcStride];\
00870         const int src1= src[1*srcStride];\
00871         const int src2= src[2*srcStride];\
00872         const int src3= src[3*srcStride];\
00873         const int src4= src[4*srcStride];\
00874         const int src5= src[5*srcStride];\
00875         const int src6= src[6*srcStride];\
00876         const int src7= src[7*srcStride];\
00877         const int src8= src[8*srcStride];\
00878         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
00879         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
00880         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
00881         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
00882         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
00883         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
00884         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
00885         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
00886         dst++;\
00887         src++;\
00888     }\
00889 }\
00890 \
/* 16-wide horizontal filter: h rows, each row reads src[0..16] */\
00891 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00892     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00893     int i;\
00894     \
00895     for(i=0; i<h; i++)\
00896     {\
00897         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
00898         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
00899         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
00900         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
00901         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
00902         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
00903         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
00904         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
00905         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
00906         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
00907         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
00908         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
00909         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
00910         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
00911         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
00912         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
00913         dst+=dstStride;\
00914         src+=srcStride;\
00915     }\
00916 }\
00917 \
/* 16-wide vertical filter: 16 columns, each column reads 17 rows and writes 16 */\
00918 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00919     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00920     int i;\
00921     const int w=16;\
00922     for(i=0; i<w; i++)\
00923     {\
00924         const int src0= src[0*srcStride];\
00925         const int src1= src[1*srcStride];\
00926         const int src2= src[2*srcStride];\
00927         const int src3= src[3*srcStride];\
00928         const int src4= src[4*srcStride];\
00929         const int src5= src[5*srcStride];\
00930         const int src6= src[6*srcStride];\
00931         const int src7= src[7*srcStride];\
00932         const int src8= src[8*srcStride];\
00933         const int src9= src[9*srcStride];\
00934         const int src10= src[10*srcStride];\
00935         const int src11= src[11*srcStride];\
00936         const int src12= src[12*srcStride];\
00937         const int src13= src[13*srcStride];\
00938         const int src14= src[14*srcStride];\
00939         const int src15= src[15*srcStride];\
00940         const int src16= src[16*srcStride];\
00941         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
00942         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
00943         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
00944         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
00945         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
00946         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
00947         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
00948         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
00949         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
00950         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
00951         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
00952         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
00953         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
00954         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
00955         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
00956         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
00957         dst++;\
00958         src++;\
00959     }\
00960 }\
00961 \
/* 8x8 entry points; scratch planes: full = 9 rows at stride 16, halfH = 9 rows of 8, half/halfV/halfHV = 8x8 */\
00962 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
00963     uint8_t half[64];\
00964     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
00965     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
00966 }\
00967 \
00968 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
00969     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
00970 }\
00971 \
00972 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
00973     uint8_t half[64];\
00974     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
00975     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
00976 }\
00977 \
00978 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
00979     uint8_t full[16*9];\
00980     uint8_t half[64];\
00981     copy_block9(full, src, 16, stride, 9);\
00982     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
00983     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
00984 }\
00985 \
00986 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
00987     uint8_t full[16*9];\
00988     copy_block9(full, src, 16, stride, 9);\
00989     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
00990 }\
00991 \
00992 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
00993     uint8_t full[16*9];\
00994     uint8_t half[64];\
00995     copy_block9(full, src, 16, stride, 9);\
00996     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
00997     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
00998 }\
00999 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
01000     uint8_t full[16*9];\
01001     uint8_t halfH[72];\
01002     uint8_t halfV[64];\
01003     uint8_t halfHV[64];\
01004     copy_block9(full, src, 16, stride, 9);\
01005     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01006     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01007     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01008     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01009 }\
01010 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
01011     uint8_t full[16*9];\
01012     uint8_t halfH[72];\
01013     uint8_t halfHV[64];\
01014     copy_block9(full, src, 16, stride, 9);\
01015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01016     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
01017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01018     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
01019 }\
01020 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
01021     uint8_t full[16*9];\
01022     uint8_t halfH[72];\
01023     uint8_t halfV[64];\
01024     uint8_t halfHV[64];\
01025     copy_block9(full, src, 16, stride, 9);\
01026     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01027     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01028     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01029     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01030 }\
01031 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
01032     uint8_t full[16*9];\
01033     uint8_t halfH[72];\
01034     uint8_t halfHV[64];\
01035     copy_block9(full, src, 16, stride, 9);\
01036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01037     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
01038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01039     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
01040 }\
01041 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
01042     uint8_t full[16*9];\
01043     uint8_t halfH[72];\
01044     uint8_t halfV[64];\
01045     uint8_t halfHV[64];\
01046     copy_block9(full, src, 16, stride, 9);\
01047     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01048     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01049     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01050     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01051 }\
01052 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01053     uint8_t full[16*9];\
01054     uint8_t halfH[72];\
01055     uint8_t halfHV[64];\
01056     copy_block9(full, src, 16, stride, 9);\
01057     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01058     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
01059     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01060     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01061 }\
01062 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01063     uint8_t full[16*9];\
01064     uint8_t halfH[72];\
01065     uint8_t halfV[64];\
01066     uint8_t halfHV[64];\
01067     copy_block9(full, src, 16, stride, 9);\
01068     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
01069     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01070     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01071     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01072 }\
01073 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01074     uint8_t full[16*9];\
01075     uint8_t halfH[72];\
01076     uint8_t halfHV[64];\
01077     copy_block9(full, src, 16, stride, 9);\
01078     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01079     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
01080     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01081     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01082 }\
01083 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01084     uint8_t halfH[72];\
01085     uint8_t halfHV[64];\
01086     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01087     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01088     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
01089 }\
01090 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01091     uint8_t halfH[72];\
01092     uint8_t halfHV[64];\
01093     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01094     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01095     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01096 }\
01097 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01098     uint8_t full[16*9];\
01099     uint8_t halfH[72];\
01100     uint8_t halfV[64];\
01101     uint8_t halfHV[64];\
01102     copy_block9(full, src, 16, stride, 9);\
01103     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01104     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01105     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01106     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
01107 }\
01108 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01109     uint8_t full[16*9];\
01110     uint8_t halfH[72];\
01111     copy_block9(full, src, 16, stride, 9);\
01112     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01113     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
01114     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01115 }\
01116 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01117     uint8_t full[16*9];\
01118     uint8_t halfH[72];\
01119     uint8_t halfV[64];\
01120     uint8_t halfHV[64];\
01121     copy_block9(full, src, 16, stride, 9);\
01122     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01123     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01124     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01125     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
01126 }\
01127 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01128     uint8_t full[16*9];\
01129     uint8_t halfH[72];\
01130     copy_block9(full, src, 16, stride, 9);\
01131     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01132     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
01133     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01134 }\
01135 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
01136     uint8_t halfH[72];\
01137     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01138     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01139 }\
01140 \
/* 16x16 entry points; scratch planes: full = 17 rows at stride 24, halfH = 17 rows of 16, half/halfV/halfHV = 16x16 */\
01141 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
01142     uint8_t half[256];\
01143     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01144     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
01145 }\
01146 \
01147 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
01148     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
01149 }\
01150 \
01151 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
01152     uint8_t half[256];\
01153     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01154     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
01155 }\
01156 \
01157 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
01158     uint8_t full[24*17];\
01159     uint8_t half[256];\
01160     copy_block17(full, src, 24, stride, 17);\
01161     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
01162     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
01163 }\
01164 \
01165 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
01166     uint8_t full[24*17];\
01167     copy_block17(full, src, 24, stride, 17);\
01168     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
01169 }\
01170 \
01171 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
01172     uint8_t full[24*17];\
01173     uint8_t half[256];\
01174     copy_block17(full, src, 24, stride, 17);\
01175     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
01176     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
01177 }\
01178 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
01179     uint8_t full[24*17];\
01180     uint8_t halfH[272];\
01181     uint8_t halfV[256];\
01182     uint8_t halfHV[256];\
01183     copy_block17(full, src, 24, stride, 17);\
01184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01185     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01187     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01188 }\
01189 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
01190     uint8_t full[24*17];\
01191     uint8_t halfH[272];\
01192     uint8_t halfHV[256];\
01193     copy_block17(full, src, 24, stride, 17);\
01194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01195     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01197     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01198 }\
01199 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
01200     uint8_t full[24*17];\
01201     uint8_t halfH[272];\
01202     uint8_t halfV[256];\
01203     uint8_t halfHV[256];\
01204     copy_block17(full, src, 24, stride, 17);\
01205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01208     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01209 }\
01210 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
01211     uint8_t full[24*17];\
01212     uint8_t halfH[272];\
01213     uint8_t halfHV[256];\
01214     copy_block17(full, src, 24, stride, 17);\
01215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01216     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01218     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01219 }\
01220 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
01221     uint8_t full[24*17];\
01222     uint8_t halfH[272];\
01223     uint8_t halfV[256];\
01224     uint8_t halfHV[256];\
01225     copy_block17(full, src, 24, stride, 17);\
01226     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01227     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01228     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01229     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01230 }\
01231 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01232     uint8_t full[24*17];\
01233     uint8_t halfH[272];\
01234     uint8_t halfHV[256];\
01235     copy_block17(full, src, 24, stride, 17);\
01236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01237     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01238     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01239     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01240 }\
01241 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01242     uint8_t full[24*17];\
01243     uint8_t halfH[272];\
01244     uint8_t halfV[256];\
01245     uint8_t halfHV[256];\
01246     copy_block17(full, src, 24, stride, 17);\
01247     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
01248     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01249     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01250     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01251 }\
01252 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01253     uint8_t full[24*17];\
01254     uint8_t halfH[272];\
01255     uint8_t halfHV[256];\
01256     copy_block17(full, src, 24, stride, 17);\
01257     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01258     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01259     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01260     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01261 }\
01262 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01263     uint8_t halfH[272];\
01264     uint8_t halfHV[256];\
01265     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01266     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01267     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01268 }\
01269 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01270     uint8_t halfH[272];\
01271     uint8_t halfHV[256];\
01272     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01273     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01274     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01275 }\
01276 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01277     uint8_t full[24*17];\
01278     uint8_t halfH[272];\
01279     uint8_t halfV[256];\
01280     uint8_t halfHV[256];\
01281     copy_block17(full, src, 24, stride, 17);\
01282     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01283     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01284     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01285     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
01286 }\
01287 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01288     uint8_t full[24*17];\
01289     uint8_t halfH[272];\
01290     copy_block17(full, src, 24, stride, 17);\
01291     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01292     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01293     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01294 }\
01295 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01296     uint8_t full[24*17];\
01297     uint8_t halfH[272];\
01298     uint8_t halfV[256];\
01299     uint8_t halfHV[256];\
01300     copy_block17(full, src, 24, stride, 17);\
01301     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01302     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01303     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01304     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
01305 }\
01306 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01307     uint8_t full[24*17];\
01308     uint8_t halfH[272];\
01309     copy_block17(full, src, 24, stride, 17);\
01310     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01311     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01312     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01313 }\
01314 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
01315     uint8_t halfH[272];\
01316     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01317     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01318 }
01319 
/*
 * Store operations passed to QPEL_MC as its OP argument. `a` is the
 * destination lvalue and `b` the unscaled filter sum. The QPEL_MC taps
 * (20, 20, -6, -6, 3, 3, -1, -1) sum to 32, hence the >>5; the bias is 16
 * for the regular variants and 15 for the "no_rnd" ones.
 * `cm` is not a parameter: it must be in scope where the macro expands
 * (the QPEL_MC lowpass functions set it to ff_cropTbl + MAX_NEG_CROP,
 * presumably a clamp-to-8-bit lookup table -- confirm where ff_cropTbl is
 * filled in).
 * The avg variants additionally average the result with the value already
 * in `a` (adding 1 before the shift in op_avg, nothing in op_avg_no_rnd).
 */
01320 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
01321 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
01322 #define op_put(a, b) a = cm[((b) + 16)>>5]
01323 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
01324 
/* Instantiate the put_, put_no_rnd_ and avg_ quarter-pel function families. */
01325 QPEL_MC(0, put_       , _       , op_put)
01326 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
01327 QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd family is not instantiated, so op_avg_no_rnd is unused here. */
01328 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* helpers are only meaningful inside QPEL_MC expansions; drop them. */
01329 #undef op_avg
01330 #undef op_avg_no_rnd
01331 #undef op_put
01332 #undef op_put_no_rnd
01333 
/*
 * QPEL_MC generates no mc00 functions; the mc00 names are aliased to
 * existing whole-block put/avg functions instead. The put_no_rnd aliases
 * point at the same put functions as the regular ones.
 * NOTE(review): the 16x16 no_rnd alias spells its target with an extra
 * "_8" (ff_put_pixels16x16_8_c) unlike the other five entries -- confirm
 * that both spellings resolve to the same function.
 */
01334 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
01335 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
01336 #define put_qpel16_mc00_c ff_put_pixels16x16_c
01337 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
01338 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
01339 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
01340 
01341 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
01342     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
01343     int i;
01344 
01345     for(i=0; i<h; i++){
01346         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
01347         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
01348         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
01349         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
01350         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
01351         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
01352         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
01353         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
01354         dst+=dstStride;
01355         src+=srcStride;
01356     }
01357 }
01358 
#if CONFIG_RV40_DECODER
/*
 * RV40 "mc33" entry points. Each one simply forwards to the matching 8-bit
 * pixelsN_xy2 routine with the block height filled in (16 rows for the
 * 16-wide variants, 8 rows for the 8-wide ones).
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
#endif /* CONFIG_RV40_DECODER */
01373 
01374 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
01375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
01376     int i;
01377 
01378     for(i=0; i<w; i++){
01379         const int src_1= src[ -srcStride];
01380         const int src0 = src[0          ];
01381         const int src1 = src[  srcStride];
01382         const int src2 = src[2*srcStride];
01383         const int src3 = src[3*srcStride];
01384         const int src4 = src[4*srcStride];
01385         const int src5 = src[5*srcStride];
01386         const int src6 = src[6*srcStride];
01387         const int src7 = src[7*srcStride];
01388         const int src8 = src[8*srcStride];
01389         const int src9 = src[9*srcStride];
01390         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
01391         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
01392         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
01393         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
01394         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
01395         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
01396         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
01397         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
01398         src++;
01399         dst++;
01400     }
01401 }
01402 
/*
 * mspel position (1,0): horizontally filter the 8x8 block into a scratch
 * plane, then let put_pixels8_l2_8 combine it with the unfiltered source.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
01408 
/* mspel position (2,0): the horizontal filter output is written directly. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
01412 
/*
 * mspel position (3,0): same as (1,0) except that the horizontally filtered
 * plane is combined with the source shifted one pixel to the right.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
01418 
/* mspel position (0,2): the vertical filter output is written directly. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
01422 
/*
 * mspel position (1,2): combine the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above the block, so
 * the following vertical pass (which starts at the second scratch row) has
 * the extra row above and two rows below that it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hplane[11 * 8];
    uint8_t vplane[8 * 8];
    uint8_t hvplane[8 * 8];

    wmv2_mspel8_h_lowpass(hplane, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vplane, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvplane, hplane + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vplane, hvplane, stride, 8, 8, 8);
}
/*
 * mspel position (3,2): like (1,2), but the purely vertical plane is taken
 * from the source shifted one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hplane[11 * 8];
    uint8_t vplane[8 * 8];
    uint8_t hvplane[8 * 8];

    wmv2_mspel8_h_lowpass(hplane, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vplane, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvplane, hplane + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vplane, hvplane, stride, 8, 8, 8);
}
/*
 * mspel position (2,2): horizontal filter into an 11-row scratch plane
 * (starting one row above the block), then vertical filter from the second
 * scratch row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hplane[11 * 8];

    wmv2_mspel8_h_lowpass(hplane, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hplane + 8, stride, 8, 8);
}
01446 
01447 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
01448     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
01449     int x;
01450     const int strength= ff_h263_loop_filter_strength[qscale];
01451 
01452     for(x=0; x<8; x++){
01453         int d1, d2, ad1;
01454         int p0= src[x-2*stride];
01455         int p1= src[x-1*stride];
01456         int p2= src[x+0*stride];
01457         int p3= src[x+1*stride];
01458         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
01459 
01460         if     (d<-2*strength) d1= 0;
01461         else if(d<-  strength) d1=-2*strength - d;
01462         else if(d<   strength) d1= d;
01463         else if(d< 2*strength) d1= 2*strength - d;
01464         else                   d1= 0;
01465 
01466         p1 += d1;
01467         p2 -= d1;
01468         if(p1&256) p1= ~(p1>>31);
01469         if(p2&256) p2= ~(p2>>31);
01470 
01471         src[x-1*stride] = p1;
01472         src[x+0*stride] = p2;
01473 
01474         ad1= FFABS(d1)>>1;
01475 
01476         d2= av_clip((p0-p3)/4, -ad1, ad1);
01477 
01478         src[x-2*stride] = p0 - d2;
01479         src[x+  stride] = p3 + d2;
01480     }
01481     }
01482 }
01483 
01484 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
01485     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
01486     int y;
01487     const int strength= ff_h263_loop_filter_strength[qscale];
01488 
01489     for(y=0; y<8; y++){
01490         int d1, d2, ad1;
01491         int p0= src[y*stride-2];
01492         int p1= src[y*stride-1];
01493         int p2= src[y*stride+0];
01494         int p3= src[y*stride+1];
01495         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
01496 
01497         if     (d<-2*strength) d1= 0;
01498         else if(d<-  strength) d1=-2*strength - d;
01499         else if(d<   strength) d1= d;
01500         else if(d< 2*strength) d1= 2*strength - d;
01501         else                   d1= 0;
01502 
01503         p1 += d1;
01504         p2 -= d1;
01505         if(p1&256) p1= ~(p1>>31);
01506         if(p2&256) p2= ~(p2>>31);
01507 
01508         src[y*stride-1] = p1;
01509         src[y*stride+0] = p2;
01510 
01511         ad1= FFABS(d1)>>1;
01512 
01513         d2= av_clip((p0-p3)/4, -ad1, ad1);
01514 
01515         src[y*stride-2] = p0 - d2;
01516         src[y*stride+1] = p3 + d2;
01517     }
01518     }
01519 }
01520 
/**
 * Separable 1-2-1 smoothing of an 8x8 block, in place.
 *
 * The vertical pass is stored in a temporary buffer; rows 0 and 7 are not
 * filtered vertically (they are only scaled by 4). The horizontal pass then
 * writes back to src; columns 0 and 7 are not filtered horizontally.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance between vertically adjacent pixels
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* vertical pass: border rows carry the same gain (4) as the 1-2-1 taps */
    for (col = 0; col < 8; col++) {
        vert[col]       = 4 * src[col];
        vert[col + 7*8] = 4 * src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *const mid = src  + row*stride;
        int *const           out = vert + row*8;

        for (col = 0; col < 8; col++)
            out[col] = mid[col - stride] + 2*mid[col] + mid[col + stride];
    }

    /* horizontal pass: border columns only undo the vertical gain */
    for (row = 0; row < 8; row++) {
        uint8_t *const   out = src  + row*stride;
        const int *const in  = vert + row*8;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8) >> 4;
    }
}
01547 
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 * @return accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01575 
/**
 * Sum of absolute differences between a 16-wide block and avg2() of each
 * pix2 sample with its right-hand neighbour. Reads pix2[0..16] per row.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose horizontally adjacent samples are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01603 
/**
 * Sum of absolute differences between a 16-wide block and avg2() of each
 * pix2 sample with the sample one row below it. Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose vertically adjacent samples are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
01633 
/**
 * Sum of absolute differences between a 16-wide block and avg4() of each
 * 2x2 neighbourhood of pix2. Reads pix2[0..16] on h+1 rows.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose 2x2 neighbourhoods are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
01663 
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 * @return accumulated |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01683 
/**
 * Sum of absolute differences between an 8-wide block and avg2() of each
 * pix2 sample with its right-hand neighbour. Reads pix2[0..8] per row.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose horizontally adjacent samples are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01703 
/**
 * Sum of absolute differences between an 8-wide block and avg2() of each
 * pix2 sample with the sample one row below it. Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose vertically adjacent samples are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
01725 
/**
 * Sum of absolute differences between an 8-wide block and avg4() of each
 * 2x2 neighbourhood of pix2. Reads pix2[0..8] on h+1 rows.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose 2x2 neighbourhoods are averaged
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
01747 
01748 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
01749     MpegEncContext *c = v;
01750     int score1=0;
01751     int score2=0;
01752     int x,y;
01753 
01754     for(y=0; y<h; y++){
01755         for(x=0; x<16; x++){
01756             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
01757         }
01758         if(y+1<h){
01759             for(x=0; x<15; x++){
01760                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
01761                              - s1[x+1] + s1[x+1+stride])
01762                         -FFABS(  s2[x  ] - s2[x  +stride]
01763                              - s2[x+1] + s2[x+1+stride]);
01764             }
01765         }
01766         s1+= stride;
01767         s2+= stride;
01768     }
01769 
01770     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01771     else  return score1 + FFABS(score2)*8;
01772 }
01773 
01774 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
01775     MpegEncContext *c = v;
01776     int score1=0;
01777     int score2=0;
01778     int x,y;
01779 
01780     for(y=0; y<h; y++){
01781         for(x=0; x<8; x++){
01782             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
01783         }
01784         if(y+1<h){
01785             for(x=0; x<7; x++){
01786                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
01787                              - s1[x+1] + s1[x+1+stride])
01788                         -FFABS(  s2[x  ] - s2[x  +stride]
01789                              - s2[x+1] + s2[x+1+stride]);
01790             }
01791         }
01792         s1+= stride;
01793         s2+= stride;
01794     }
01795 
01796     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01797     else  return score1 + FFABS(score2)*8;
01798 }
01799 
01800 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
01801     int i;
01802     unsigned int sum=0;
01803 
01804     for(i=0; i<8*8; i++){
01805         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
01806         int w= weight[i];
01807         b>>= RECON_SHIFT;
01808         assert(-512<b && b<512);
01809 
01810         sum += (w*b)*(w*b)>>4;
01811     }
01812     return sum>>2;
01813 }
01814 
01815 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
01816     int i;
01817 
01818     for(i=0; i<8*8; i++){
01819         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
01820     }
01821 }
01822 
01831 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
01832 {
01833     int i;
01834     DCTELEM temp[64];
01835 
01836     if(last<=0) return;
01837     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
01838 
01839     for(i=0; i<=last; i++){
01840         const int j= scantable[i];
01841         temp[j]= block[j];
01842         block[j]=0;
01843     }
01844 
01845     for(i=0; i<=last; i++){
01846         const int j= scantable[i];
01847         const int perm_j= permutation[j];
01848         block[perm_j]= temp[j];
01849     }
01850 }
01851 
/**
 * Comparison function that ignores its inputs and always reports a score
 * of 0; installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
01855 
01856 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
01857     int i;
01858 
01859     memset(cmp, 0, sizeof(void*)*6);
01860 
01861     for(i=0; i<6; i++){
01862         switch(type&0xFF){
01863         case FF_CMP_SAD:
01864             cmp[i]= c->sad[i];
01865             break;
01866         case FF_CMP_SATD:
01867             cmp[i]= c->hadamard8_diff[i];
01868             break;
01869         case FF_CMP_SSE:
01870             cmp[i]= c->sse[i];
01871             break;
01872         case FF_CMP_DCT:
01873             cmp[i]= c->dct_sad[i];
01874             break;
01875         case FF_CMP_DCT264:
01876             cmp[i]= c->dct264_sad[i];
01877             break;
01878         case FF_CMP_DCTMAX:
01879             cmp[i]= c->dct_max[i];
01880             break;
01881         case FF_CMP_PSNR:
01882             cmp[i]= c->quant_psnr[i];
01883             break;
01884         case FF_CMP_BIT:
01885             cmp[i]= c->bit[i];
01886             break;
01887         case FF_CMP_RD:
01888             cmp[i]= c->rd[i];
01889             break;
01890         case FF_CMP_VSAD:
01891             cmp[i]= c->vsad[i];
01892             break;
01893         case FF_CMP_VSSE:
01894             cmp[i]= c->vsse[i];
01895             break;
01896         case FF_CMP_ZERO:
01897             cmp[i]= zero_cmp;
01898             break;
01899         case FF_CMP_NSSE:
01900             cmp[i]= c->nsse[i];
01901             break;
01902 #if CONFIG_DWT
01903         case FF_CMP_W53:
01904             cmp[i]= c->w53[i];
01905             break;
01906         case FF_CMP_W97:
01907             cmp[i]= c->w97[i];
01908             break;
01909 #endif
01910         default:
01911             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
01912         }
01913     }
01914 }
01915 
01916 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
01917     long i;
01918     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01919         long a = *(long*)(src+i);
01920         long b = *(long*)(dst+i);
01921         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
01922     }
01923     for(; i<w; i++)
01924         dst[i+0] += src[i+0];
01925 }
01926 
01927 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
01928     long i;
01929     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01930         long a = *(long*)(src1+i);
01931         long b = *(long*)(src2+i);
01932         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
01933     }
01934     for(; i<w; i++)
01935         dst[i] = src1[i]+src2[i];
01936 }
01937 
01938 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
01939     long i;
01940 #if !HAVE_FAST_UNALIGNED
01941     if((long)src2 & (sizeof(long)-1)){
01942         for(i=0; i+7<w; i+=8){
01943             dst[i+0] = src1[i+0]-src2[i+0];
01944             dst[i+1] = src1[i+1]-src2[i+1];
01945             dst[i+2] = src1[i+2]-src2[i+2];
01946             dst[i+3] = src1[i+3]-src2[i+3];
01947             dst[i+4] = src1[i+4]-src2[i+4];
01948             dst[i+5] = src1[i+5]-src2[i+5];
01949             dst[i+6] = src1[i+6]-src2[i+6];
01950             dst[i+7] = src1[i+7]-src2[i+7];
01951         }
01952     }else
01953 #endif
01954     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01955         long a = *(long*)(src1+i);
01956         long b = *(long*)(src2+i);
01957         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
01958     }
01959     for(; i<w; i++)
01960         dst[i+0] = src1[i+0]-src2[i+0];
01961 }
01962 
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous output,
 * the sample above (src1[i]) and (previous + above - above-left) & 0xFF;
 * the residual diff[i] is added to it, wrapping to 8 bits.
 *
 * @param dst      reconstructed row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last output sample
 * @param left_top in: sample above-left of the row start; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top      = src1[x];
        const int     gradient = (cur + top - top_left) & 0xFF;

        cur      = mid_pred(cur, top, gradient) + diff[x];
        top_left = top;
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
01979 
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous source
 * sample, the sample above (src1[i]) and (previous + above - above-left)
 * & 0xFF; dst[i] receives src2[i] minus that prediction, wrapping to 8 bits.
 *
 * @param dst      residual row
 * @param src1     row above
 * @param src2     current row
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last src2 sample
 * @param left_top in: sample above-left of the row start; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top      = src1[x];
        const int     gradient = (prev + top - top_left) & 0xFF;
        const int     pred     = mid_pred(prev, top, gradient);

        top_left = top;
        prev     = src2[x];
        dst[x]   = prev - pred;
    }

    *left     = prev;
    *left_top = top_left;
}
01997 
/**
 * Running sum of src written to dst: dst[i] is the low 8 bits of
 * acc + src[0] + ... + src[i].
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc starting value of the accumulator
 * @return the accumulator after the last sample (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
02016 
/* byte offsets of the four channels inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over w 4-byte pixels: every output byte is the
 * low 8 bits of its channel accumulator after adding the source byte.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   residual pixels, 4 bytes each
 * @param w     number of pixels
 * @param red   in: starting accumulator; out: accumulator after the last pixel
 * @param green same, for the green channel
 * @param blue  same, for the blue channel
 * @param alpha same, for the alpha channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *const in  = src + 4*px;
        uint8_t *const       out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
02057 
/* Two-output butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two statements, so it must not be the unbraced body of a
 * loop or conditional. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1 = (i1) + (i2);\
    o2 = (i1) - (i2);

/* In-place butterfly: x becomes old x + old y, y becomes old x - old y. */
#define BUTTERFLY1(x,y) \
{\
    int bf_first, bf_second;\
    bf_first  = x;\
    bf_second = y;\
    x = bf_first + bf_second;\
    y = bf_first - bf_second;\
}

/* Magnitude of the final butterfly stage without storing it: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
02072 
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the difference src - dst.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance between rows, same for both blocks
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i, k;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each row of differences */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *const           row = temp + 8*i;
        const uint8_t *const p   = src + stride*i;
        const uint8_t *const q   = dst + stride*i;

        for (k = 0; k < 8; k += 2) {
            BUTTERFLY2(row[k], row[k+1], p[k]-q[k], p[k+1]-q[k+1]);
        }
        for (k = 0; k < 8; k += 4) {
            BUTTERFLY1(row[k  ], row[k+2]);
            BUTTERFLY1(row[k+1], row[k+3]);
        }
        for (k = 0; k < 4; k++) {
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for (i = 0; i < 8; i++) {
        int *const col = temp + i;

        for (k = 0; k < 64; k += 16) {
            BUTTERFLY1(col[k], col[k+8]);
        }
        for (k = 0; k < 64; k += 32) {
            BUTTERFLY1(col[k  ], col[k+16]);
            BUTTERFLY1(col[k+8], col[k+24]);
        }
        for (k = 0; k < 32; k += 8) {
            sum += BUTTERFLYA(col[k], col[k+32]);
        }
    }
    return sum;
}
02117 
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of src itself, with the magnitude of the mean (DC) term removed.
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i, k;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each row */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *const           row = temp + 8*i;
        const uint8_t *const p   = src + stride*i;

        for (k = 0; k < 8; k += 2) {
            BUTTERFLY2(row[k], row[k+1], p[k], p[k+1]);
        }
        for (k = 0; k < 8; k += 4) {
            BUTTERFLY1(row[k  ], row[k+2]);
            BUTTERFLY1(row[k+1], row[k+3]);
        }
        for (k = 0; k < 4; k++) {
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for (i = 0; i < 8; i++) {
        int *const col = temp + i;

        for (k = 0; k < 64; k += 16) {
            BUTTERFLY1(col[k], col[k+8]);
        }
        for (k = 0; k < 64; k += 32) {
            BUTTERFLY1(col[k  ], col[k+16]);
            BUTTERFLY1(col[k+8], col[k+24]);
        }
        for (k = 0; k < 32; k += 8) {
            sum += BUTTERFLYA(col[k], col[k+32]);
        }
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
02165 
02166 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02167     MpegEncContext * const s= (MpegEncContext *)c;
02168     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02169 
02170     assert(h==8);
02171 
02172     s->dsp.diff_pixels(temp, src1, src2, stride);
02173     s->dsp.fdct(temp);
02174     return s->dsp.sum_abs_dctelem(temp);
02175 }
02176 
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform built from adds, subtracts and shifts.
 * Inputs are read through SRC(0..7) and outputs are emitted through
 * DST(index, value); both must be defined by the code that expands this.
 * Expands to a single braced block.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied to the
 * rows and then the columns of the 8x8 difference src1 - src2.
 *
 * @param c      MpegEncContext (only dsp.diff_pixels is used)
 * @param src1   first block
 * @param src2   second block
 * @param stride distance between rows, same for both blocks
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext *const enc = c;
    DCTELEM coef[8][8];
    int sad = 0;
    int k;

    enc->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* row pass: transform each row in place */
#define SRC(x) coef[k][x]
#define DST(x,v) coef[k][x]= v
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: outputs are only accumulated, never stored */
#define SRC(x) coef[x][k]
#define DST(x,v) sad += FFABS(v)
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST
    return sad;
}
#endif
02229 
02230 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02231     MpegEncContext * const s= (MpegEncContext *)c;
02232     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02233     int sum=0, i;
02234 
02235     assert(h==8);
02236 
02237     s->dsp.diff_pixels(temp, src1, src2, stride);
02238     s->dsp.fdct(temp);
02239 
02240     for(i=0; i<64; i++)
02241         sum= FFMAX(sum, FFABS(temp[i]));
02242 
02243     return sum;
02244 }
02245 
02246 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02247     MpegEncContext * const s= (MpegEncContext *)c;
02248     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
02249     DCTELEM * const bak = temp+64;
02250     int sum=0, i;
02251 
02252     assert(h==8);
02253     s->mb_intra=0;
02254 
02255     s->dsp.diff_pixels(temp, src1, src2, stride);
02256 
02257     memcpy(bak, temp, 64*sizeof(DCTELEM));
02258 
02259     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02260     s->dct_unquantize_inter(s, temp, 0, s->qscale);
02261     ff_simple_idct(temp); //FIXME
02262 
02263     for(i=0; i<64; i++)
02264         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
02265 
02266     return sum;
02267 }
02268 
02269 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02270     MpegEncContext * const s= (MpegEncContext *)c;
02271     const uint8_t *scantable= s->intra_scantable.permutated;
02272     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02273     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
02274     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
02275     int i, last, run, bits, level, distortion, start_i;
02276     const int esc_length= s->ac_esc_length;
02277     uint8_t * length;
02278     uint8_t * last_length;
02279 
02280     assert(h==8);
02281 
02282     copy_block8(lsrc1, src1, 8, stride, 8);
02283     copy_block8(lsrc2, src2, 8, stride, 8);
02284 
02285     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
02286 
02287     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02288 
02289     bits=0;
02290 
02291     if (s->mb_intra) {
02292         start_i = 1;
02293         length     = s->intra_ac_vlc_length;
02294         last_length= s->intra_ac_vlc_last_length;
02295         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
02296     } else {
02297         start_i = 0;
02298         length     = s->inter_ac_vlc_length;
02299         last_length= s->inter_ac_vlc_last_length;
02300     }
02301 
02302     if(last>=start_i){
02303         run=0;
02304         for(i=start_i; i<last; i++){
02305             int j= scantable[i];
02306             level= temp[j];
02307 
02308             if(level){
02309                 level+=64;
02310                 if((level&(~127)) == 0){
02311                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
02312                 }else
02313                     bits+= esc_length;
02314                 run=0;
02315             }else
02316                 run++;
02317         }
02318         i= scantable[last];
02319 
02320         level= temp[i] + 64;
02321 
02322         assert(level - 64);
02323 
02324         if((level&(~127)) == 0){
02325             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
02326         }else
02327             bits+= esc_length;
02328 
02329     }
02330 
02331     if(last>=0){
02332         if(s->mb_intra)
02333             s->dct_unquantize_intra(s, temp, 0, s->qscale);
02334         else
02335             s->dct_unquantize_inter(s, temp, 0, s->qscale);
02336     }
02337 
02338     s->dsp.idct_add(lsrc2, 8, temp);
02339 
02340     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
02341 
02342     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
02343 }
02344 
02345 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02346     MpegEncContext * const s= (MpegEncContext *)c;
02347     const uint8_t *scantable= s->intra_scantable.permutated;
02348     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02349     int i, last, run, bits, level, start_i;
02350     const int esc_length= s->ac_esc_length;
02351     uint8_t * length;
02352     uint8_t * last_length;
02353 
02354     assert(h==8);
02355 
02356     s->dsp.diff_pixels(temp, src1, src2, stride);
02357 
02358     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02359 
02360     bits=0;
02361 
02362     if (s->mb_intra) {
02363         start_i = 1;
02364         length     = s->intra_ac_vlc_length;
02365         last_length= s->intra_ac_vlc_last_length;
02366         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
02367     } else {
02368         start_i = 0;
02369         length     = s->inter_ac_vlc_length;
02370         last_length= s->inter_ac_vlc_last_length;
02371     }
02372 
02373     if(last>=start_i){
02374         run=0;
02375         for(i=start_i; i<last; i++){
02376             int j= scantable[i];
02377             level= temp[j];
02378 
02379             if(level){
02380                 level+=64;
02381                 if((level&(~127)) == 0){
02382                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
02383                 }else
02384                     bits+= esc_length;
02385                 run=0;
02386             }else
02387                 run++;
02388         }
02389         i= scantable[last];
02390 
02391         level= temp[i] + 64;
02392 
02393         assert(level - 64);
02394 
02395         if((level&(~127)) == 0){
02396             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
02397         }else
02398             bits+= esc_length;
02399     }
02400 
02401     return bits;
02402 }
02403 
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel of a <size>-wide block and the pixel directly below it,
 * over rows 0..h-2. The c and dummy parameters are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *const below = s + stride;                                                            \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            score += FFABS(s[col] - below[col]);                                                            \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
02421 
/**
 * Vertical SAD of the difference signal s1 - s2 for a 16-pixel-wide block.
 *
 * For each of the h-1 vertically adjacent row pairs, accumulates
 * |(s1 - s2)(x, y) - (s1 - s2)(x, y + 1)| over the 16 columns.
 * Rows are `stride` bytes apart; `c` is not used.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int cur  = s1[col]          - s2[col];
            const int next = s1[col + stride] - s2[col + stride];

            total += FFABS(cur - next);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
02436 
/** Square of a value; the argument expression is expanded twice. */
#define SQ(a) ((a)*(a))

/**
 * Generate vsse_intra<size>_c(): the sum of squared differences between
 * every pixel of a block and the pixel directly below it.
 *
 * The block is `size` pixels wide and `h` rows tall, so h-1 row pairs are
 * compared; rows are `stride` bytes apart. The `c` and `dummy` arguments are
 * not used by the generated function.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int sum = 0;                                                                                            \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        const uint8_t *below = s + stride;                                                                  \
        for (col = 0; col < size; col++)                                                                    \
            sum += SQ(s[col] - below[col]);                                                                 \
    }                                                                                                       \
                                                                                                            \
    return sum;                                                                                             \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
02455 
/**
 * Vertical SSE of the difference signal s1 - s2 for a 16-pixel-wide block.
 *
 * For each of the h-1 vertically adjacent row pairs, accumulates the square
 * of (s1 - s2)(x, y) - (s1 - s2)(x, y + 1) over the 16 columns.
 * Rows are `stride` bytes apart; `c` is not used.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int cur  = s1[col]          - s2[col];
            const int next = s1[col + stride] - s2[col + stride];
            const int d    = cur - next;

            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
02470 
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, `size` elements
 * @param pix2 second vector, `size` elements
 * @param size number of elements to compare
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        sum += d * d;
    }
    return sum;
}
02479 
/* Instantiate the *16_c comparison functions that dsputil_init() installs
 * next to their *8x8_c counterparts (see SET_CMP_FUNC there).
 * NOTE(review): WRAPPER8_16_SQ is defined outside this section; presumably it
 * builds the function named by its second argument out of calls to the 8x8
 * function named by its first -- confirm against the macro definition.
 * The dct264_sad pair is only instantiated when CONFIG_GPL is set. */
02480 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
02481 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
02482 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
02483 #if CONFIG_GPL
02484 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
02485 #endif
02486 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
02487 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
02488 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
02489 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
02490 
/**
 * Element-wise product of two float vectors: dst[k] = src0[k] * src1[k].
 *
 * @param len number of elements to process
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
02496 
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[k] = src0[k] * src1[len - 1 - k].
 *
 * @param len number of elements to process
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;

        dst[k] = src0[k] * src1[mirrored];
    }
}
02503 
/**
 * Element-wise multiply-add: dst[k] = src0[k] * src1[k] + src2[k].
 *
 * @param len number of elements to process
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
02509 
/**
 * Windowed overlap of two len-element inputs into a 2*len-element output.
 *
 * For k in [0, len), with lo = k and hi = 2*len - 1 - k:
 *   dst[lo] = src0[k] * win[hi] - src1[len - 1 - k] * win[lo]
 *   dst[hi] = src0[k] * win[lo] + src1[len - 1 - k] * win[hi]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements (read front to back)
 * @param src1 second input, len elements (read back to front)
 * @param win  window, 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;
        const int hi = 2 * len - 1 - k;
        const float a    = src0[k];
        const float b    = src1[len - 1 - k];
        const float w_lo = win[lo];
        const float w_hi = win[hi];

        dst[lo] = a*w_hi - b*w_lo;
        dst[hi] = a*w_lo + b*w_hi;
    }
}
02526 
/**
 * Scale a float vector by a constant: dst[k] = src[k] * mul.
 *
 * @param len number of elements to process
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
02534 
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Each consecutive pair of elements uses the next pointer in sv:
 * dst[2n + j] = src[2n + j] * sv[n][j] * mul for j in {0, 1}.
 * The loop advances two elements at a time while the index is below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = src[base    ] * vec[0] * mul;
        dst[base + 1] = src[base + 1] * vec[1] * mul;
    }
}
02544 
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Each consecutive group of four elements uses the next pointer in sv:
 * dst[4n + j] = src[4n + j] * sv[n][j] * mul for j in [0, 4).
 * The loop advances four elements at a time while the index is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base, j;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        for (j = 0; j < 4; j++)
            dst[base + j] = src[base + j] * vec[j] * mul;
    }
}
02556 
/**
 * Expand a sequence of 2-element vectors into dst, scaled by mul:
 * dst[2n + j] = sv[n][j] * mul for j in {0, 1}.
 * The loop advances two elements at a time while the index is below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = vec[0] * mul;
        dst[base + 1] = vec[1] * mul;
    }
}
02566 
/**
 * Expand a sequence of 4-element vectors into dst, scaled by mul:
 * dst[4n + j] = sv[n][j] * mul for j in [0, 4).
 * The loop advances four elements at a time while the index is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base, j;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        for (j = 0; j < 4; j++)
            dst[base + j] = vec[j] * mul;
    }
}
02578 
/**
 * In-place butterfly on two float vectors:
 * v1[k] becomes v1[k] + v2[k] and v2[k] becomes the old v1[k] - v2[k].
 *
 * @param len number of elements to process
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
02589 
/**
 * Dot product of two float vectors, accumulated in single precision in
 * index order.
 *
 * @param len number of elements
 * @return sum of v1[k] * v2[k] for k in [0, len); 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
02600 
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] using
 * integer compares only. Valid only when min < 0 < max.
 *
 * Assuming IEEE-754 single precision: negative values have the sign bit
 * set, so as unsigned integers they compare greater the further they are
 * below zero; hence a > mini means "below min". Flipping the sign bit of a
 * and of max makes positive values compare the same way, hence the second
 * test means "above max".
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of min (negative)
 * @param maxi     bit pattern of max (positive)
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U<<31)) > maxisign)
        return maxi;
    return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max.
 *
 * Elements are processed in groups of 8 while the group start is below len,
 * so dst and src must be valid for len rounded up to a multiple of 8.
 *
 * The float bit patterns are moved in and out with memcpy() rather than by
 * dereferencing uint32_t pointers cast from float pointers, which would
 * violate the C effective-type (strict aliasing) rules.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for (i = 0; i < len; i += 8) {
        for (k = 0; k < 8; k++) {
            uint32_t bits;

            memcpy(&bits, &src[i + k], sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(&dst[i + k], &bits, sizeof(bits));
        }
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min < 0 < max the work is handed to vector_clipf_c_opposite_sign();
 * otherwise each element goes through av_clipf(). Elements are processed in
 * groups of 8 while the group start is below len, so dst and src must be
 * valid for len rounded up to a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
02645 
/**
 * Dot product of two int16_t vectors, with each individual product shifted
 * right by `shift` bits before it is accumulated.
 *
 * @param order number of elements
 * @param shift right shift applied to every product
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; k != order; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
02655 
/**
 * Compute the dot product of v1 and v2 while updating v1 in place.
 *
 * For every element the product v1[k] * v2[k] is accumulated using the value
 * of v1[k] from before the update, then v1[k] += mul * v3[k] is applied (the
 * result is stored back as int16_t).
 *
 * @param order number of elements
 * @return dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; k != order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
02665 
/**
 * Apply a symmetric window to 16-bit samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k scales
 * both sample k and its mirror sample len-1-k. Each product gets 1 << 14
 * added and is shifted right by 15 before being stored. When len is odd the
 * middle sample is not written.
 * NOTE(review): MUL16 is defined outside this section, presumably a
 * 16x16 -> 32 bit multiply -- confirm.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int lo;

    for (lo = 0; lo < half; lo++) {
        const unsigned int hi = len - lo - 1;
        const int16_t coef    = window[lo];

        output[lo] = (MUL16(input[lo], coef) + (1 << 14)) >> 15;
        output[hi] = (MUL16(input[hi], coef) + (1 << 14)) >> 15;
    }
}
02678 
/* Fixed-point cosine factors, Wk = 2048*sqrt(2)*cos(k*pi/16), W0 = 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional 8-point inverse transform of a row of 8 contiguous
 * coefficients, done in place. Results are rounded and shifted right by 8.
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: weighted sums of the input pairs */
    const int r1 = W1*b[1] + W7*b[7];
    const int r7 = W7*b[1] - W1*b[7];
    const int r5 = W5*b[5] + W3*b[3];
    const int r3 = W3*b[5] - W5*b[3];
    const int r2 = W2*b[2] + W6*b[6];
    const int r6 = W6*b[2] - W2*b[6];
    const int r0 = W0*b[0] + W0*b[4];
    const int r4 = W0*b[0] - W0*b[4];
    /* step 2: the two odd terms scaled by 181/256 */
    const int m1 = (181*(r1 - r5 + r7 - r3) + 128) >> 8;
    const int m2 = (181*(r1 - r5 - r7 + r3) + 128) >> 8;
    /* step 3: even parts carry the rounding constant, odd parts are added
     * for outputs 0..3 and subtracted for the mirrored outputs 7..4 */
    const int e0 = r0 + r2 + (1 << 7);
    const int e1 = r4 + r6 + (1 << 7);
    const int e2 = r4 - r6 + (1 << 7);
    const int e3 = r0 - r2 + (1 << 7);
    const int o0 = r1 + r5;
    const int o3 = r7 + r3;

    b[0] = (e0 + o0) >> 8;
    b[1] = (e1 + m1) >> 8;
    b[2] = (e2 + m2) >> 8;
    b[3] = (e3 + o3) >> 8;
    b[4] = (e3 - o3) >> 8;
    b[5] = (e2 - m2) >> 8;
    b[6] = (e1 - m1) >> 8;
    b[7] = (e0 - o0) >> 8;
}

/**
 * One-dimensional 8-point inverse transform of a column (elements 8 apart),
 * done in place. Step 1 results are pre-shifted right by 3; final results
 * are rounded and shifted right by 14.
 */
static void wmv2_idct_col(short * b)
{
    /* step 1, with extended precision */
    const int r1 = (W1*b[8*1] + W7*b[8*7] + 4) >> 3;
    const int r7 = (W7*b[8*1] - W1*b[8*7] + 4) >> 3;
    const int r5 = (W5*b[8*5] + W3*b[8*3] + 4) >> 3;
    const int r3 = (W3*b[8*5] - W5*b[8*3] + 4) >> 3;
    const int r2 = (W2*b[8*2] + W6*b[8*6] + 4) >> 3;
    const int r6 = (W6*b[8*2] - W2*b[8*6] + 4) >> 3;
    const int r0 = (W0*b[8*0] + W0*b[8*4]    ) >> 3;
    const int r4 = (W0*b[8*0] - W0*b[8*4]    ) >> 3;
    /* step 2 */
    const int m1 = (181*(r1 - r5 + r7 - r3) + 128) >> 8;
    const int m2 = (181*(r1 - r5 - r7 + r3) + 128) >> 8;
    /* step 3 */
    const int e0 = r0 + r2 + (1 << 13);
    const int e1 = r4 + r6 + (1 << 13);
    const int e2 = r4 - r6 + (1 << 13);
    const int e3 = r0 - r2 + (1 << 13);
    const int o0 = r1 + r5;
    const int o3 = r7 + r3;

    b[8*0] = (e0 + o0) >> 14;
    b[8*1] = (e1 + m1) >> 14;
    b[8*2] = (e2 + m2) >> 14;
    b[8*3] = (e3 + o3) >> 14;

    b[8*4] = (e3 - o3) >> 14;
    b[8*5] = (e2 - m2) >> 14;
    b[8*6] = (e1 - m1) >> 14;
    b[8*7] = (e0 - o0) >> 14;
}

/**
 * In-place 8x8 inverse DCT (WMV2 variant): all eight rows are transformed
 * first, then all eight columns.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
02751 /* XXX: those functions should be suppressed ASAP when all IDCTs are
02752  converted */
02753 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
02754 {
02755     ff_wmv2_idct_c(block);
02756     ff_put_pixels_clamped_c(block, dest, line_size);
02757 }
02758 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
02759 {
02760     ff_wmv2_idct_c(block);
02761     ff_add_pixels_clamped_c(block, dest, line_size);
02762 }
02763 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02764 {
02765     j_rev_dct (block);
02766     ff_put_pixels_clamped_c(block, dest, line_size);
02767 }
02768 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02769 {
02770     j_rev_dct (block);
02771     ff_add_pixels_clamped_c(block, dest, line_size);
02772 }
02773 
02774 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
02775 {
02776     j_rev_dct4 (block);
02777     put_pixels_clamped4_c(block, dest, line_size);
02778 }
02779 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
02780 {
02781     j_rev_dct4 (block);
02782     add_pixels_clamped4_c(block, dest, line_size);
02783 }
02784 
02785 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
02786 {
02787     j_rev_dct2 (block);
02788     put_pixels_clamped2_c(block, dest, line_size);
02789 }
02790 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
02791 {
02792     j_rev_dct2 (block);
02793     add_pixels_clamped2_c(block, dest, line_size);
02794 }
02795 
02796 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
02797 {
02798     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02799 
02800     dest[0] = cm[(block[0] + 4)>>3];
02801 }
02802 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
02803 {
02804     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02805 
02806     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
02807 }
02808 
02809 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
02810 
02811 /* init static data */
02812 av_cold void dsputil_static_init(void)
02813 {
02814     int i;
02815 
02816     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
02817     for(i=0;i<MAX_NEG_CROP;i++) {
02818         ff_cropTbl[i] = 0;
02819         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
02820     }
02821 
02822     for(i=0;i<512;i++) {
02823         ff_squareTbl[i] = (i - 256) * (i - 256);
02824     }
02825 
02826     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
02827 }
02828 
02829 int ff_check_alignment(void){
02830     static int did_fail=0;
02831     DECLARE_ALIGNED(16, int, aligned);
02832 
02833     if((intptr_t)&aligned & 15){
02834         if(!did_fail){
02835 #if HAVE_MMX || HAVE_ALTIVEC
02836             av_log(NULL, AV_LOG_ERROR,
02837                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
02838                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
02839                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
02840                 "Do not report crashes to Libav developers.\n");
02841 #endif
02842             did_fail=1;
02843         }
02844         return -1;
02845     }
02846     return 0;
02847 }
02848 
02849 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
02850 {
02851     int i;
02852 
02853     ff_check_alignment();
02854 
02855 #if CONFIG_ENCODERS
02856     if(avctx->dct_algo==FF_DCT_FASTINT) {
02857         c->fdct = fdct_ifast;
02858         c->fdct248 = fdct_ifast248;
02859     }
02860     else if(avctx->dct_algo==FF_DCT_FAAN) {
02861         c->fdct = ff_faandct;
02862         c->fdct248 = ff_faandct248;
02863     }
02864     else {
02865         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
02866         c->fdct248 = ff_fdct248_islow;
02867     }
02868 #endif //CONFIG_ENCODERS
02869 
02870     if(avctx->lowres==1){
02871         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
02872             c->idct_put= ff_jref_idct4_put;
02873             c->idct_add= ff_jref_idct4_add;
02874         }else{
02875             if (avctx->codec_id != CODEC_ID_H264) {
02876                 c->idct_put= ff_h264_lowres_idct_put_8_c;
02877                 c->idct_add= ff_h264_lowres_idct_add_8_c;
02878             } else {
02879                 switch (avctx->bits_per_raw_sample) {
02880                     case 9:
02881                         c->idct_put= ff_h264_lowres_idct_put_9_c;
02882                         c->idct_add= ff_h264_lowres_idct_add_9_c;
02883                         break;
02884                     case 10:
02885                         c->idct_put= ff_h264_lowres_idct_put_10_c;
02886                         c->idct_add= ff_h264_lowres_idct_add_10_c;
02887                         break;
02888                     default:
02889                         c->idct_put= ff_h264_lowres_idct_put_8_c;
02890                         c->idct_add= ff_h264_lowres_idct_add_8_c;
02891                 }
02892             }
02893         }
02894         c->idct    = j_rev_dct4;
02895         c->idct_permutation_type= FF_NO_IDCT_PERM;
02896     }else if(avctx->lowres==2){
02897         c->idct_put= ff_jref_idct2_put;
02898         c->idct_add= ff_jref_idct2_add;
02899         c->idct    = j_rev_dct2;
02900         c->idct_permutation_type= FF_NO_IDCT_PERM;
02901     }else if(avctx->lowres==3){
02902         c->idct_put= ff_jref_idct1_put;
02903         c->idct_add= ff_jref_idct1_add;
02904         c->idct    = j_rev_dct1;
02905         c->idct_permutation_type= FF_NO_IDCT_PERM;
02906     }else{
02907         if(avctx->idct_algo==FF_IDCT_INT){
02908             c->idct_put= ff_jref_idct_put;
02909             c->idct_add= ff_jref_idct_add;
02910             c->idct    = j_rev_dct;
02911             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02912         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
02913                 avctx->idct_algo==FF_IDCT_VP3){
02914             c->idct_put= ff_vp3_idct_put_c;
02915             c->idct_add= ff_vp3_idct_add_c;
02916             c->idct    = ff_vp3_idct_c;
02917             c->idct_permutation_type= FF_NO_IDCT_PERM;
02918         }else if(avctx->idct_algo==FF_IDCT_WMV2){
02919             c->idct_put= ff_wmv2_idct_put_c;
02920             c->idct_add= ff_wmv2_idct_add_c;
02921             c->idct    = ff_wmv2_idct_c;
02922             c->idct_permutation_type= FF_NO_IDCT_PERM;
02923         }else if(avctx->idct_algo==FF_IDCT_FAAN){
02924             c->idct_put= ff_faanidct_put;
02925             c->idct_add= ff_faanidct_add;
02926             c->idct    = ff_faanidct;
02927             c->idct_permutation_type= FF_NO_IDCT_PERM;
02928         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
02929             c->idct_put= ff_ea_idct_put_c;
02930             c->idct_permutation_type= FF_NO_IDCT_PERM;
02931         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
02932             c->idct     = ff_bink_idct_c;
02933             c->idct_add = ff_bink_idct_add_c;
02934             c->idct_put = ff_bink_idct_put_c;
02935             c->idct_permutation_type = FF_NO_IDCT_PERM;
02936         }else{ //accurate/default
02937             c->idct_put= ff_simple_idct_put;
02938             c->idct_add= ff_simple_idct_add;
02939             c->idct    = ff_simple_idct;
02940             c->idct_permutation_type= FF_NO_IDCT_PERM;
02941         }
02942     }
02943 
02944     c->get_pixels = get_pixels_c;
02945     c->diff_pixels = diff_pixels_c;
02946     c->put_pixels_clamped = ff_put_pixels_clamped_c;
02947     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
02948     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
02949     c->add_pixels_clamped = ff_add_pixels_clamped_c;
02950     c->sum_abs_dctelem = sum_abs_dctelem_c;
02951     c->gmc1 = gmc1_c;
02952     c->gmc = ff_gmc_c;
02953     c->pix_sum = pix_sum_c;
02954     c->pix_norm1 = pix_norm1_c;
02955 
02956     c->fill_block_tab[0] = fill_block16_c;
02957     c->fill_block_tab[1] = fill_block8_c;
02958     c->scale_block = scale_block_c;
02959 
02960     /* TODO [0] 16  [1] 8 */
02961     c->pix_abs[0][0] = pix_abs16_c;
02962     c->pix_abs[0][1] = pix_abs16_x2_c;
02963     c->pix_abs[0][2] = pix_abs16_y2_c;
02964     c->pix_abs[0][3] = pix_abs16_xy2_c;
02965     c->pix_abs[1][0] = pix_abs8_c;
02966     c->pix_abs[1][1] = pix_abs8_x2_c;
02967     c->pix_abs[1][2] = pix_abs8_y2_c;
02968     c->pix_abs[1][3] = pix_abs8_xy2_c;
02969 
02970     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
02971     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
02972     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
02973     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
02974     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
02975     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
02976     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
02977     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
02978     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
02979 
02980     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
02981     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
02982     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
02983     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
02984     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
02985     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
02986     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
02987     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
02988     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
02989 
02990 #define dspfunc(PFX, IDX, NUM) \
02991     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
02992     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
02993     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
02994     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
02995     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
02996     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
02997     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
02998     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
02999     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
03000     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
03001     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
03002     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
03003     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
03004     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
03005     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
03006     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
03007 
03008     dspfunc(put_qpel, 0, 16);
03009     dspfunc(put_no_rnd_qpel, 0, 16);
03010 
03011     dspfunc(avg_qpel, 0, 16);
03012     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
03013 
03014     dspfunc(put_qpel, 1, 8);
03015     dspfunc(put_no_rnd_qpel, 1, 8);
03016 
03017     dspfunc(avg_qpel, 1, 8);
03018     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
03019 
03020 #undef dspfunc
03021 
03022 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
03023     ff_mlp_init(c, avctx);
03024 #endif
03025 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
03026     ff_intrax8dsp_init(c,avctx);
03027 #endif
03028 #if CONFIG_RV30_DECODER
03029     ff_rv30dsp_init(c,avctx);
03030 #endif
03031 #if CONFIG_RV40_DECODER
03032     ff_rv40dsp_init(c,avctx);
03033     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
03034     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
03035     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
03036     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
03037 #endif
03038 
03039     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
03040     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
03041     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
03042     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
03043     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
03044     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
03045     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
03046     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
03047 
03048 #define SET_CMP_FUNC(name) \
03049     c->name[0]= name ## 16_c;\
03050     c->name[1]= name ## 8x8_c;
03051 
03052     SET_CMP_FUNC(hadamard8_diff)
03053     c->hadamard8_diff[4]= hadamard8_intra16_c;
03054     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
03055     SET_CMP_FUNC(dct_sad)
03056     SET_CMP_FUNC(dct_max)
03057 #if CONFIG_GPL
03058     SET_CMP_FUNC(dct264_sad)
03059 #endif
03060     c->sad[0]= pix_abs16_c;
03061     c->sad[1]= pix_abs8_c;
03062     c->sse[0]= sse16_c;
03063     c->sse[1]= sse8_c;
03064     c->sse[2]= sse4_c;
03065     SET_CMP_FUNC(quant_psnr)
03066     SET_CMP_FUNC(rd)
03067     SET_CMP_FUNC(bit)
03068     c->vsad[0]= vsad16_c;
03069     c->vsad[4]= vsad_intra16_c;
03070     c->vsad[5]= vsad_intra8_c;
03071     c->vsse[0]= vsse16_c;
03072     c->vsse[4]= vsse_intra16_c;
03073     c->vsse[5]= vsse_intra8_c;
03074     c->nsse[0]= nsse16_c;
03075     c->nsse[1]= nsse8_c;
03076 #if CONFIG_DWT
03077     ff_dsputil_init_dwt(c);
03078 #endif
03079 
03080     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
03081 
03082     c->add_bytes= add_bytes_c;
03083     c->add_bytes_l2= add_bytes_l2_c;
03084     c->diff_bytes= diff_bytes_c;
03085     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
03086     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
03087     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
03088     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
03089     c->bswap_buf= bswap_buf;
03090     c->bswap16_buf = bswap16_buf;
03091 #if CONFIG_PNG_DECODER
03092     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
03093 #endif
03094 
03095     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
03096         c->h263_h_loop_filter= h263_h_loop_filter_c;
03097         c->h263_v_loop_filter= h263_v_loop_filter_c;
03098     }
03099 
03100     if (CONFIG_VP3_DECODER) {
03101         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
03102         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
03103         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
03104     }
03105 
03106     c->h261_loop_filter= h261_loop_filter_c;
03107 
03108     c->try_8x8basis= try_8x8basis_c;
03109     c->add_8x8basis= add_8x8basis_c;
03110 
03111 #if CONFIG_VORBIS_DECODER
03112     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
03113 #endif
03114 #if CONFIG_AC3_DECODER
03115     c->ac3_downmix = ff_ac3_downmix_c;
03116 #endif
03117     c->vector_fmul = vector_fmul_c;
03118     c->vector_fmul_reverse = vector_fmul_reverse_c;
03119     c->vector_fmul_add = vector_fmul_add_c;
03120     c->vector_fmul_window = vector_fmul_window_c;
03121     c->vector_clipf = vector_clipf_c;
03122     c->scalarproduct_int16 = scalarproduct_int16_c;
03123     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
03124     c->apply_window_int16 = apply_window_int16_c;
03125     c->scalarproduct_float = scalarproduct_float_c;
03126     c->butterflies_float = butterflies_float_c;
03127     c->vector_fmul_scalar = vector_fmul_scalar_c;
03128 
03129     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
03130     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
03131 
03132     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
03133     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
03134 
03135     c->shrink[0]= av_image_copy_plane;
03136     c->shrink[1]= ff_shrink22;
03137     c->shrink[2]= ff_shrink44;
03138     c->shrink[3]= ff_shrink88;
03139 
03140     c->prefetch= just_return;
03141 
03142     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
03143     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
03144 
03145 #undef FUNC
03146 #undef FUNCC
03147 #define FUNC(f, depth) f ## _ ## depth
03148 #define FUNCC(f, depth) f ## _ ## depth ## _c
03149 
03150 #define dspfunc1(PFX, IDX, NUM, depth)\
03151     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
03152     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
03153     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
03154     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
03155 
03156 #define dspfunc2(PFX, IDX, NUM, depth)\
03157     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
03158     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
03159     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
03160     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
03161     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
03162     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
03163     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
03164     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
03165     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
03166     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
03167     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
03168     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
03169     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
03170     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
03171     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
03172     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
03173 
03174 
/*
 * BIT_DEPTH_FUNCS(depth): point the bit-depth dependent members of `c`
 * (a variable expected at the expansion site) at the functions selected
 * for `depth`.
 *
 * Members assigned directly:
 *   draw_edges, emulated_edge_mc, clear_block, clear_blocks,
 *   add_pixels8, add_pixels4, put_no_rnd_pixels_l2[0..1],
 *   put_h264_chroma_pixels_tab[0..2], avg_h264_chroma_pixels_tab[0..2].
 * emulated_edge_mc is the only one resolved through FUNC rather than FUNCC;
 * both macros are defined outside this section.
 *
 * Members assigned through helper macros:
 *   dspfunc1 (defined outside this section):
 *     put and avg               with (IDX, NUM) = (0,16) (1,8) (2,4) (3,2)
 *     put_no_rnd and avg_no_rnd with (IDX, NUM) = (0,16) (1,8)
 *   dspfunc2:
 *     put_h264_qpel             with (IDX, NUM) = (0,16) (1,8) (2,4) (3,2)
 *     avg_h264_qpel             with (IDX, NUM) = (0,16) (1,8) (2,4)
 *
 * NOTE(review): avg_h264_qpel has no IDX 3 row while put_h264_qpel does;
 * this looks deliberate but should be confirmed against the table users.
 *
 * Unlike dspfunc2, this expansion already ends with a semicolon, which is
 * why the invocations in this section are written without one.
 */
03175 #define BIT_DEPTH_FUNCS(depth)\
03176     c->draw_edges                    = FUNCC(draw_edges            , depth);\
03177     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
03178     c->clear_block                   = FUNCC(clear_block           , depth);\
03179     c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
03180     c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
03181     c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
03182     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
03183     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
03184 \
03185     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
03186     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
03187     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
03188     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
03189     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
03190     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
03191 \
03192     dspfunc1(put       , 0, 16, depth);\
03193     dspfunc1(put       , 1,  8, depth);\
03194     dspfunc1(put       , 2,  4, depth);\
03195     dspfunc1(put       , 3,  2, depth);\
03196     dspfunc1(put_no_rnd, 0, 16, depth);\
03197     dspfunc1(put_no_rnd, 1,  8, depth);\
03198     dspfunc1(avg       , 0, 16, depth);\
03199     dspfunc1(avg       , 1,  8, depth);\
03200     dspfunc1(avg       , 2,  4, depth);\
03201     dspfunc1(avg       , 3,  2, depth);\
03202     dspfunc1(avg_no_rnd, 0, 16, depth);\
03203     dspfunc1(avg_no_rnd, 1,  8, depth);\
03204 \
03205     dspfunc2(put_h264_qpel, 0, 16, depth);\
03206     dspfunc2(put_h264_qpel, 1,  8, depth);\
03207     dspfunc2(put_h264_qpel, 2,  4, depth);\
03208     dspfunc2(put_h264_qpel, 3,  2, depth);\
03209     dspfunc2(avg_h264_qpel, 0, 16, depth);\
03210     dspfunc2(avg_h264_qpel, 1,  8, depth);\
03211     dspfunc2(avg_h264_qpel, 2,  4, depth);
03212 
03213     if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
03214         BIT_DEPTH_FUNCS(8)
03215     } else {
03216         switch (avctx->bits_per_raw_sample) {
03217             case 9:
03218                 BIT_DEPTH_FUNCS(9)
03219                 break;
03220             case 10:
03221                 BIT_DEPTH_FUNCS(10)
03222                 break;
03223             default:
03224                 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
03225                 BIT_DEPTH_FUNCS(8)
03226                 break;
03227         }
03228     }
03229 
03230 
03231     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
03232     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
03233     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
03234     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
03235     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
03236     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
03237     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
03238     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
03239     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
03240 
03241     for(i=0; i<64; i++){
03242         if(!c->put_2tap_qpel_pixels_tab[0][i])
03243             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
03244         if(!c->avg_2tap_qpel_pixels_tab[0][i])
03245             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
03246     }
03247 
03248     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
03249     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
03250     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
03251     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
03252 
03253     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
03254     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
03255     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
03256     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
03257 
03258     switch(c->idct_permutation_type){
03259     case FF_NO_IDCT_PERM:
03260         for(i=0; i<64; i++)
03261             c->idct_permutation[i]= i;
03262         break;
03263     case FF_LIBMPEG2_IDCT_PERM:
03264         for(i=0; i<64; i++)
03265             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
03266         break;
03267     case FF_SIMPLE_IDCT_PERM:
03268         for(i=0; i<64; i++)
03269             c->idct_permutation[i]= simple_mmx_permutation[i];
03270         break;
03271     case FF_TRANSPOSE_IDCT_PERM:
03272         for(i=0; i<64; i++)
03273             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
03274         break;
03275     case FF_PARTTRANS_IDCT_PERM:
03276         for(i=0; i<64; i++)
03277             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
03278         break;
03279     case FF_SSE2_IDCT_PERM:
03280         for(i=0; i<64; i++)
03281             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
03282         break;
03283     default:
03284         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
03285     }
03286 }
03287