libavcodec/dsputil.c
Go to the documentation of this file.
00001 /*
00002  * DSP utils
00003  * Copyright (c) 2000, 2001 Fabrice Bellard
00004  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
00007  *
00008  * This file is part of Libav.
00009  *
00010  * Libav is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Lesser General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2.1 of the License, or (at your option) any later version.
00014  *
00015  * Libav is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with Libav; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00023  */
00024 
00030 #include "libavutil/imgutils.h"
00031 #include "avcodec.h"
00032 #include "dsputil.h"
00033 #include "simple_idct.h"
00034 #include "faandct.h"
00035 #include "faanidct.h"
00036 #include "mathops.h"
00037 #include "mpegvideo.h"
00038 #include "config.h"
00039 #include "ac3dec.h"
00040 #include "vorbis.h"
00041 #include "png.h"
00042 
/* Byte lookup table with MAX_NEG_CROP guard entries on either side of the
 * 256 central ones. Code in this file reads it through a pointer biased by
 * MAX_NEG_CROP (cm = ff_cropTbl + MAX_NEG_CROP). It is only zero-initialised
 * here; presumably an init routine elsewhere fills it -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read through ff_squareTbl + 256, indexed either by a pixel
 * value or by the difference of two pixel values (-255..255). Only
 * zero-initialised here; presumably filled with squares elsewhere -- TODO
 * confirm. */
uint32_t ff_squareTbl[512] = {0, };
00045 
/* dsputil_template.c is included once per value of BIT_DEPTH (9, 10, 8). */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

/* The 8-bit instantiation comes last and BIT_DEPTH is not #undef'd afterwards,
 * so it stays defined as 8 for the remainder of this file. */
#define BIT_DEPTH 8
#include "dsputil_template.c"
00056 
// Byte-replicated constants: ~0UL/255 is 0x0101...01 at the width of
// unsigned long, so the products are 0x7f7f7f7f / 0x80808080 when unsigned
// long is 32 bits and 0x7f7f7f7f7f7f7f7f / 0x8080808080808080 when it is 64.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
00060 
/* Zigzag scan order: a permutation of 0..63. Read as row*8 + column, the
 * entries walk an 8x8 block along its anti-diagonals starting at index 0. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
00071 
/* Zigzag scan used with the 248 IDCT. NOTE: unlike the specification, the
   two fields are interleaved here. The table is a permutation of 0..63. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
00084 
/* Inverse of ff_zigzag_direct, plus 1, without any IDCT permutation applied;
   used by the MMX quantizer. 16-byte aligned, 64 uint16_t entries. It is only
   declared here; presumably filled during initialisation -- TODO confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
00087 
/* Alternate scan order favouring horizontal runs: a permutation of 0..63
 * (read as row*8 + column) that visits several entries of a row before
 * moving down. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
00098 
/* Alternate scan order favouring vertical runs: a permutation of 0..63
 * (read as row*8 + column) that visits several entries of a column before
 * moving right. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
00109 
/* Input coefficient permutation expected by simple_idct_mmx. Copied verbatim
   into the IDCT permutation table when FF_SIMPLE_IDCT_PERM is selected in
   ff_init_scantable_permutation(). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
00121 
/* Within-row (low 3 index bits) permutation applied for FF_SSE2_IDCT_PERM;
   the row bits (0x38) are left untouched by the caller. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
00123 
00124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
00125     int i;
00126     int end;
00127 
00128     st->scantable= src_scantable;
00129 
00130     for(i=0; i<64; i++){
00131         int j;
00132         j = src_scantable[i];
00133         st->permutated[i] = permutation[j];
00134 #if ARCH_PPC
00135         st->inverse[j] = i;
00136 #endif
00137     }
00138 
00139     end=-1;
00140     for(i=0; i<64; i++){
00141         int j;
00142         j = st->permutated[i];
00143         if(j>end) end=j;
00144         st->raster_end[i]= end;
00145     }
00146 }
00147 
00148 void ff_init_scantable_permutation(uint8_t *idct_permutation,
00149                                    int idct_permutation_type)
00150 {
00151     int i;
00152 
00153     switch(idct_permutation_type){
00154     case FF_NO_IDCT_PERM:
00155         for(i=0; i<64; i++)
00156             idct_permutation[i]= i;
00157         break;
00158     case FF_LIBMPEG2_IDCT_PERM:
00159         for(i=0; i<64; i++)
00160             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00161         break;
00162     case FF_SIMPLE_IDCT_PERM:
00163         for(i=0; i<64; i++)
00164             idct_permutation[i]= simple_mmx_permutation[i];
00165         break;
00166     case FF_TRANSPOSE_IDCT_PERM:
00167         for(i=0; i<64; i++)
00168             idct_permutation[i]= ((i&7)<<3) | (i>>3);
00169         break;
00170     case FF_PARTTRANS_IDCT_PERM:
00171         for(i=0; i<64; i++)
00172             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
00173         break;
00174     case FF_SSE2_IDCT_PERM:
00175         for(i=0; i<64; i++)
00176             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
00177         break;
00178     default:
00179         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
00180     }
00181 }
00182 
/**
 * Sum all pixels of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    const uint8_t *row = pix;
    int total = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++) {
            total += row[x];
        }
        row += line_size;
    }
    return total;
}
00204 
00205 static int pix_norm1_c(uint8_t * pix, int line_size)
00206 {
00207     int s, i, j;
00208     uint32_t *sq = ff_squareTbl + 256;
00209 
00210     s = 0;
00211     for (i = 0; i < 16; i++) {
00212         for (j = 0; j < 16; j += 8) {
00213 #if 0
00214             s += sq[pix[0]];
00215             s += sq[pix[1]];
00216             s += sq[pix[2]];
00217             s += sq[pix[3]];
00218             s += sq[pix[4]];
00219             s += sq[pix[5]];
00220             s += sq[pix[6]];
00221             s += sq[pix[7]];
00222 #else
00223 #if HAVE_FAST_64BIT
00224             register uint64_t x=*(uint64_t*)pix;
00225             s += sq[x&0xff];
00226             s += sq[(x>>8)&0xff];
00227             s += sq[(x>>16)&0xff];
00228             s += sq[(x>>24)&0xff];
00229             s += sq[(x>>32)&0xff];
00230             s += sq[(x>>40)&0xff];
00231             s += sq[(x>>48)&0xff];
00232             s += sq[(x>>56)&0xff];
00233 #else
00234             register uint32_t x=*(uint32_t*)pix;
00235             s += sq[x&0xff];
00236             s += sq[(x>>8)&0xff];
00237             s += sq[(x>>16)&0xff];
00238             s += sq[(x>>24)&0xff];
00239             x=*(uint32_t*)(pix+4);
00240             s += sq[x&0xff];
00241             s += sq[(x>>8)&0xff];
00242             s += sq[(x>>16)&0xff];
00243             s += sq[(x>>24)&0xff];
00244 #endif
00245 #endif
00246             pix += 8;
00247         }
00248         pix += line_size - 16;
00249     }
00250     return s;
00251 }
00252 
/**
 * Byte-swap w 32-bit words from src into dst with av_bswap32().
 * Words are processed in ascending index order; nothing happens for w <= 0.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int idx = 0;

    while (idx < w) {
        dst[idx] = av_bswap32(src[idx]);
        idx++;
    }
}
00270 
/**
 * Byte-swap len 16-bit words from src into dst with av_bswap16().
 * The loop counts len down to zero, so len must not be negative.
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    for (; len != 0; len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
00276 
00277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00278 {
00279     int s, i;
00280     uint32_t *sq = ff_squareTbl + 256;
00281 
00282     s = 0;
00283     for (i = 0; i < h; i++) {
00284         s += sq[pix1[0] - pix2[0]];
00285         s += sq[pix1[1] - pix2[1]];
00286         s += sq[pix1[2] - pix2[2]];
00287         s += sq[pix1[3] - pix2[3]];
00288         pix1 += line_size;
00289         pix2 += line_size;
00290     }
00291     return s;
00292 }
00293 
00294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00295 {
00296     int s, i;
00297     uint32_t *sq = ff_squareTbl + 256;
00298 
00299     s = 0;
00300     for (i = 0; i < h; i++) {
00301         s += sq[pix1[0] - pix2[0]];
00302         s += sq[pix1[1] - pix2[1]];
00303         s += sq[pix1[2] - pix2[2]];
00304         s += sq[pix1[3] - pix2[3]];
00305         s += sq[pix1[4] - pix2[4]];
00306         s += sq[pix1[5] - pix2[5]];
00307         s += sq[pix1[6] - pix2[6]];
00308         s += sq[pix1[7] - pix2[7]];
00309         pix1 += line_size;
00310         pix2 += line_size;
00311     }
00312     return s;
00313 }
00314 
00315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00316 {
00317     int s, i;
00318     uint32_t *sq = ff_squareTbl + 256;
00319 
00320     s = 0;
00321     for (i = 0; i < h; i++) {
00322         s += sq[pix1[ 0] - pix2[ 0]];
00323         s += sq[pix1[ 1] - pix2[ 1]];
00324         s += sq[pix1[ 2] - pix2[ 2]];
00325         s += sq[pix1[ 3] - pix2[ 3]];
00326         s += sq[pix1[ 4] - pix2[ 4]];
00327         s += sq[pix1[ 5] - pix2[ 5]];
00328         s += sq[pix1[ 6] - pix2[ 6]];
00329         s += sq[pix1[ 7] - pix2[ 7]];
00330         s += sq[pix1[ 8] - pix2[ 8]];
00331         s += sq[pix1[ 9] - pix2[ 9]];
00332         s += sq[pix1[10] - pix2[10]];
00333         s += sq[pix1[11] - pix2[11]];
00334         s += sq[pix1[12] - pix2[12]];
00335         s += sq[pix1[13] - pix2[13]];
00336         s += sq[pix1[14] - pix2[14]];
00337         s += sq[pix1[15] - pix2[15]];
00338 
00339         pix1 += line_size;
00340         pix2 += line_size;
00341     }
00342     return s;
00343 }
00344 
00345 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
00346                           const uint8_t *s2, int stride){
00347     int i;
00348 
00349     /* read the pixels */
00350     for(i=0;i<8;i++) {
00351         block[0] = s1[0] - s2[0];
00352         block[1] = s1[1] - s2[1];
00353         block[2] = s1[2] - s2[2];
00354         block[3] = s1[3] - s2[3];
00355         block[4] = s1[4] - s2[4];
00356         block[5] = s1[5] - s2[5];
00357         block[6] = s1[6] - s2[6];
00358         block[7] = s1[7] - s2[7];
00359         s1 += stride;
00360         s2 += stride;
00361         block += 8;
00362     }
00363 }
00364 
00365 
00366 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00367                              int line_size)
00368 {
00369     int i;
00370 
00371     /* read the pixels */
00372     for(i=0;i<8;i++) {
00373         pixels[0] = av_clip_uint8(block[0]);
00374         pixels[1] = av_clip_uint8(block[1]);
00375         pixels[2] = av_clip_uint8(block[2]);
00376         pixels[3] = av_clip_uint8(block[3]);
00377         pixels[4] = av_clip_uint8(block[4]);
00378         pixels[5] = av_clip_uint8(block[5]);
00379         pixels[6] = av_clip_uint8(block[6]);
00380         pixels[7] = av_clip_uint8(block[7]);
00381 
00382         pixels += line_size;
00383         block += 8;
00384     }
00385 }
00386 
00387 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00388                                  int line_size)
00389 {
00390     int i;
00391 
00392     /* read the pixels */
00393     for(i=0;i<4;i++) {
00394         pixels[0] = av_clip_uint8(block[0]);
00395         pixels[1] = av_clip_uint8(block[1]);
00396         pixels[2] = av_clip_uint8(block[2]);
00397         pixels[3] = av_clip_uint8(block[3]);
00398 
00399         pixels += line_size;
00400         block += 8;
00401     }
00402 }
00403 
00404 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00405                                  int line_size)
00406 {
00407     int i;
00408 
00409     /* read the pixels */
00410     for(i=0;i<2;i++) {
00411         pixels[0] = av_clip_uint8(block[0]);
00412         pixels[1] = av_clip_uint8(block[1]);
00413 
00414         pixels += line_size;
00415         block += 8;
00416     }
00417 }
00418 
00419 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
00420                                     uint8_t *restrict pixels,
00421                                     int line_size)
00422 {
00423     int i, j;
00424 
00425     for (i = 0; i < 8; i++) {
00426         for (j = 0; j < 8; j++) {
00427             if (*block < -128)
00428                 *pixels = 0;
00429             else if (*block > 127)
00430                 *pixels = 255;
00431             else
00432                 *pixels = (uint8_t)(*block + 128);
00433             block++;
00434             pixels++;
00435         }
00436         pixels += (line_size - 8);
00437     }
00438 }
00439 
00440 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00441                              int line_size)
00442 {
00443     int i;
00444 
00445     /* read the pixels */
00446     for(i=0;i<8;i++) {
00447         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
00448         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
00449         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
00450         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
00451         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
00452         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
00453         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
00454         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
00455         pixels += line_size;
00456         block += 8;
00457     }
00458 }
00459 
00460 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00461                           int line_size)
00462 {
00463     int i;
00464 
00465     /* read the pixels */
00466     for(i=0;i<4;i++) {
00467         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
00468         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
00469         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
00470         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
00471         pixels += line_size;
00472         block += 8;
00473     }
00474 }
00475 
00476 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00477                           int line_size)
00478 {
00479     int i;
00480 
00481     /* read the pixels */
00482     for(i=0;i<2;i++) {
00483         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
00484         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
00485         pixels += line_size;
00486         block += 8;
00487     }
00488 }
00489 
00490 static int sum_abs_dctelem_c(DCTELEM *block)
00491 {
00492     int sum=0, i;
00493     for(i=0; i<64; i++)
00494         sum+= FFABS(block[i]);
00495     return sum;
00496 }
00497 
/**
 * Set the first 16 bytes of each of h rows to value.
 *
 * @param block     first row
 * @param value     byte to store
 * @param line_size row stride in bytes
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
00507 
/**
 * Set the first 8 bytes of each of h rows to value.
 *
 * @param block     first row
 * @param value     byte to store
 * @param line_size row stride in bytes
 * @param h         number of rows
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
00517 
/* Rounded-up average of two / four values. Every argument is parenthesised
 * so that an expression containing a lower-precedence operator (|, &, ?:,
 * <<, ...) is averaged as a whole instead of being split by the '+'. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
00520 
/**
 * Bilinear interpolation of an 8-pixel-wide block at a 1/16-pel offset.
 *
 * Each output pixel is the weighted sum of the source pixel, its right
 * neighbour and the two pixels one row below, plus rounder, shifted right
 * by 8. The four weights sum to 256.
 *
 * @param dst     destination, 8 pixels per row
 * @param src     source; 9 pixels per row and h+1 rows are read
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight numerator (weights are x16 and 16-x16)
 * @param y16     vertical weight numerator (weights are y16 and 16-y16)
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr =       x16  * (16 - y16);
    const int w_bl = (16 - x16) *       y16;
    const int w_br =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
00543 
/**
 * Motion compensation of an 8-pixel-wide block where the source position
 * advances by a fixed step per output pixel.
 *
 * For output pixel (x, y) the source position starts at (ox, oy), moves by
 * (dxx, dyx) per column and by (dxy, dyy) per row. After dropping the low
 * 16 bits, the low `shift` bits are the interpolation fraction and the rest
 * is the integer source coordinate. Inside the picture a bilinear blend is
 * used; when a coordinate falls outside, it is clipped and interpolation is
 * done only along the axis that is still inside (or not at all).
 *
 * @param dst           destination block
 * @param src           source picture
 * @param stride        row stride in bytes, shared by dst and src
 * @param h             number of rows to produce
 * @param ox, oy        start position
 * @param dxx, dyx      position step per output column
 * @param dxy, dyy      position step per output row
 * @param shift         number of fraction bits
 * @param r             value added before the final shift
 * @param width, height source dimensions; coordinates are limited to
 *                      width-1 / height-1
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            /* fractions are taken before the integer part is shifted out */
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < max_x;
            y_inside = (unsigned)src_y < max_y;

            if (x_inside && y_inside) {
                index = src_x + src_y * stride;
                out[col] = (  (  src[index             ] * (s - frac_x)
                               + src[index          + 1] *      frac_x ) * (s - frac_y)
                            + (  src[index + stride    ] * (s - frac_x)
                               + src[index + stride + 1] *      frac_x ) *      frac_y
                            + r) >> (shift * 2);
            } else if (x_inside) {
                /* row out of range: horizontal interpolation only */
                index = src_x + av_clip(src_y, 0, max_y) * stride;
                out[col] = ( (  src[index    ] * (s - frac_x)
                              + src[index + 1] *      frac_x ) * s
                            + r) >> (shift * 2);
            } else if (y_inside) {
                /* column out of range: vertical interpolation only */
                index = av_clip(src_x, 0, max_x) + src_y * stride;
                out[col] = ( (  src[index         ] * (s - frac_y)
                              + src[index + stride] *      frac_y ) * s
                            + r) >> (shift * 2);
            } else {
                /* both out of range: copy the clipped pixel */
                index = av_clip(src_x, 0, max_x) + av_clip(src_y, 0, max_y) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
00601 
/* Third-pel "put" at the full-pel position: dispatch on block width to the
 * matching 8-bit put_pixelsN helper. Widths other than 2, 4, 8 and 16 do
 * nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_8_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_8_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_8_c(dst, src, stride, height);
    }
}
00610 
/* Third-pel "put": blend each pixel with its right neighbour, weights 2:1.
 * 683/2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 2 * src[col] + src[col + 1] + 1;

            dst[col] = (683 * weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
00621 
/* Third-pel "put": blend each pixel with its right neighbour, weights 1:2.
 * 683/2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2 * src[col + 1] + 1;

            dst[col] = (683 * weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
00632 
/* Third-pel "put": blend each pixel with the one below it, weights 2:1.
 * 683/2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 2 * src[col] + src[col + stride] + 1;

            dst[col] = (683 * weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
00643 
/* Third-pel "put": blend the 2x2 neighbourhood with weights 4,3 / 3,2
 * (sum 12). 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 4 * src[col]          + 3 * src[col + 1] +
                                 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;

            dst[col] = (2731 * weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
00654 
/* Third-pel "put": blend the 2x2 neighbourhood with weights 3,2 / 4,3
 * (sum 12). 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 3 * src[col]          + 2 * src[col + 1] +
                                 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;

            dst[col] = (2731 * weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
00665 
/* Third-pel "put": blend each pixel with the one below it, weights 1:2.
 * 683/2048 approximates the division by 3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2 * src[col + stride] + 1;

            dst[col] = (683 * weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
00676 
/* Third-pel "put": blend the 2x2 neighbourhood with weights 3,4 / 2,3
 * (sum 12). 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 3 * src[col]          + 4 * src[col + 1] +
                                 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;

            dst[col] = (2731 * weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
00687 
/* Third-pel "put": blend the 2x2 neighbourhood with weights 2,3 / 3,4
 * (sum 12). 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 2 * src[col]          + 3 * src[col + 1] +
                                 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;

            dst[col] = (2731 * weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
00698 
/* Third-pel "avg" at the full-pel position: dispatch on block width to the
 * matching 8-bit avg_pixelsN helper. Widths other than 2, 4, 8 and 16 do
 * nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_8_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_8_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_8_c(dst, src, stride, height);
    }
}
00707 
/* Third-pel "avg": interpolate each pixel with its right neighbour (weights
 * 2:1, 683/2048 ~ 1/3), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00718 
/* Third-pel "avg": interpolate each pixel with its right neighbour (weights
 * 1:2, 683/2048 ~ 1/3), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00729 
/* Third-pel "avg": interpolate each pixel with the one below it (weights
 * 2:1, 683/2048 ~ 1/3), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00740 
/* Third-pel "avg": interpolate the 2x2 neighbourhood with weights 4,3 / 3,2
 * (2731/32768 ~ 1/12), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 4 * src[col]          + 3 * src[col + 1] +
                                 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            const int interp   = (2731 * weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00751 
/* Third-pel "avg": interpolate the 2x2 neighbourhood with weights 3,2 / 4,3
 * (2731/32768 ~ 1/12), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 3 * src[col]          + 2 * src[col + 1] +
                                 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp   = (2731 * weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00762 
/* Third-pel "avg": interpolate each pixel with the one below it (weights
 * 1:2, 683/2048 ~ 1/3), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00773 
/* Third-pel "avg": interpolate the 2x2 neighbourhood with weights 3,4 / 2,3
 * (2731/32768 ~ 1/12), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 3 * src[col]          + 4 * src[col + 1] +
                                 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp   = (2731 * weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00784 
/* Third-pel "avg": interpolate the 2x2 neighbourhood with weights 2,3 / 3,4
 * (2731/32768 ~ 1/12), then take the rounded-up mean with the value already
 * in dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int weighted = 2 * src[col]          + 3 * src[col + 1] +
                                 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            const int interp   = (2731 * weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
00795 
00796 #define QPEL_MC(r, OPNAME, RND, OP) \
00797 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00798     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00799     int i;\
00800     for(i=0; i<h; i++)\
00801     {\
00802         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
00803         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
00804         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
00805         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
00806         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
00807         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
00808         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
00809         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
00810         dst+=dstStride;\
00811         src+=srcStride;\
00812     }\
00813 }\
00814 \
00815 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00816     const int w=8;\
00817     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00818     int i;\
00819     for(i=0; i<w; i++)\
00820     {\
00821         const int src0= src[0*srcStride];\
00822         const int src1= src[1*srcStride];\
00823         const int src2= src[2*srcStride];\
00824         const int src3= src[3*srcStride];\
00825         const int src4= src[4*srcStride];\
00826         const int src5= src[5*srcStride];\
00827         const int src6= src[6*srcStride];\
00828         const int src7= src[7*srcStride];\
00829         const int src8= src[8*srcStride];\
00830         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
00831         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
00832         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
00833         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
00834         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
00835         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
00836         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
00837         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
00838         dst++;\
00839         src++;\
00840     }\
00841 }\
00842 \
00843 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00844     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00845     int i;\
00846     \
00847     for(i=0; i<h; i++)\
00848     {\
00849         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
00850         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
00851         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
00852         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
00853         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
00854         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
00855         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
00856         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
00857         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
00858         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
00859         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
00860         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
00861         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
00862         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
00863         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
00864         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
00865         dst+=dstStride;\
00866         src+=srcStride;\
00867     }\
00868 }\
00869 \
00870 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00871     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
00872     int i;\
00873     const int w=16;\
00874     for(i=0; i<w; i++)\
00875     {\
00876         const int src0= src[0*srcStride];\
00877         const int src1= src[1*srcStride];\
00878         const int src2= src[2*srcStride];\
00879         const int src3= src[3*srcStride];\
00880         const int src4= src[4*srcStride];\
00881         const int src5= src[5*srcStride];\
00882         const int src6= src[6*srcStride];\
00883         const int src7= src[7*srcStride];\
00884         const int src8= src[8*srcStride];\
00885         const int src9= src[9*srcStride];\
00886         const int src10= src[10*srcStride];\
00887         const int src11= src[11*srcStride];\
00888         const int src12= src[12*srcStride];\
00889         const int src13= src[13*srcStride];\
00890         const int src14= src[14*srcStride];\
00891         const int src15= src[15*srcStride];\
00892         const int src16= src[16*srcStride];\
00893         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
00894         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
00895         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
00896         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
00897         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
00898         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
00899         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
00900         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
00901         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
00902         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
00903         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
00904         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
00905         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
00906         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
00907         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
00908         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
00909         dst++;\
00910         src++;\
00911     }\
00912 }\
00913 \
00914 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
00915     uint8_t half[64];\
00916     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
00917     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
00918 }\
00919 \
00920 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
00921     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
00922 }\
00923 \
00924 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
00925     uint8_t half[64];\
00926     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
00927     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
00928 }\
00929 \
00930 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
00931     uint8_t full[16*9];\
00932     uint8_t half[64];\
00933     copy_block9(full, src, 16, stride, 9);\
00934     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
00935     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
00936 }\
00937 \
00938 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
00939     uint8_t full[16*9];\
00940     copy_block9(full, src, 16, stride, 9);\
00941     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
00942 }\
00943 \
00944 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
00945     uint8_t full[16*9];\
00946     uint8_t half[64];\
00947     copy_block9(full, src, 16, stride, 9);\
00948     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
00949     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
00950 }\
00951 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
00952     uint8_t full[16*9];\
00953     uint8_t halfH[72];\
00954     uint8_t halfV[64];\
00955     uint8_t halfHV[64];\
00956     copy_block9(full, src, 16, stride, 9);\
00957     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
00958     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
00959     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
00960     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
00961 }\
00962 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
00963     uint8_t full[16*9];\
00964     uint8_t halfH[72];\
00965     uint8_t halfHV[64];\
00966     copy_block9(full, src, 16, stride, 9);\
00967     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
00968     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
00969     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
00970     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
00971 }\
00972 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
00973     uint8_t full[16*9];\
00974     uint8_t halfH[72];\
00975     uint8_t halfV[64];\
00976     uint8_t halfHV[64];\
00977     copy_block9(full, src, 16, stride, 9);\
00978     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
00979     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
00980     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
00981     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
00982 }\
00983 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
00984     uint8_t full[16*9];\
00985     uint8_t halfH[72];\
00986     uint8_t halfHV[64];\
00987     copy_block9(full, src, 16, stride, 9);\
00988     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
00989     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
00990     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
00991     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
00992 }\
00993 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
00994     uint8_t full[16*9];\
00995     uint8_t halfH[72];\
00996     uint8_t halfV[64];\
00997     uint8_t halfHV[64];\
00998     copy_block9(full, src, 16, stride, 9);\
00999     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01000     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01001     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01002     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01003 }\
01004 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01005     uint8_t full[16*9];\
01006     uint8_t halfH[72];\
01007     uint8_t halfHV[64];\
01008     copy_block9(full, src, 16, stride, 9);\
01009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01010     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
01011     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01012     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01013 }\
01014 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01015     uint8_t full[16*9];\
01016     uint8_t halfH[72];\
01017     uint8_t halfV[64];\
01018     uint8_t halfHV[64];\
01019     copy_block9(full, src, 16, stride, 9);\
01020     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
01021     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01022     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01023     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01024 }\
01025 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01026     uint8_t full[16*9];\
01027     uint8_t halfH[72];\
01028     uint8_t halfHV[64];\
01029     copy_block9(full, src, 16, stride, 9);\
01030     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01031     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
01032     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01033     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01034 }\
01035 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01036     uint8_t halfH[72];\
01037     uint8_t halfHV[64];\
01038     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01039     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01040     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
01041 }\
01042 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01043     uint8_t halfH[72];\
01044     uint8_t halfHV[64];\
01045     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01046     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01047     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01048 }\
01049 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01050     uint8_t full[16*9];\
01051     uint8_t halfH[72];\
01052     uint8_t halfV[64];\
01053     uint8_t halfHV[64];\
01054     copy_block9(full, src, 16, stride, 9);\
01055     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01056     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01057     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01058     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
01059 }\
01060 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01061     uint8_t full[16*9];\
01062     uint8_t halfH[72];\
01063     copy_block9(full, src, 16, stride, 9);\
01064     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01065     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
01066     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01067 }\
01068 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01069     uint8_t full[16*9];\
01070     uint8_t halfH[72];\
01071     uint8_t halfV[64];\
01072     uint8_t halfHV[64];\
01073     copy_block9(full, src, 16, stride, 9);\
01074     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01075     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01076     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01077     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
01078 }\
01079 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01080     uint8_t full[16*9];\
01081     uint8_t halfH[72];\
01082     copy_block9(full, src, 16, stride, 9);\
01083     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01084     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
01085     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01086 }\
01087 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
01088     uint8_t halfH[72];\
01089     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01090     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01091 }\
01092 \
01093 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
01094     uint8_t half[256];\
01095     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01096     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
01097 }\
01098 \
01099 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
01100     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
01101 }\
01102 \
01103 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
01104     uint8_t half[256];\
01105     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01106     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
01107 }\
01108 \
01109 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
01110     uint8_t full[24*17];\
01111     uint8_t half[256];\
01112     copy_block17(full, src, 24, stride, 17);\
01113     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
01114     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
01115 }\
01116 \
01117 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
01118     uint8_t full[24*17];\
01119     copy_block17(full, src, 24, stride, 17);\
01120     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
01121 }\
01122 \
01123 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
01124     uint8_t full[24*17];\
01125     uint8_t half[256];\
01126     copy_block17(full, src, 24, stride, 17);\
01127     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
01128     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
01129 }\
01130 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
01131     uint8_t full[24*17];\
01132     uint8_t halfH[272];\
01133     uint8_t halfV[256];\
01134     uint8_t halfHV[256];\
01135     copy_block17(full, src, 24, stride, 17);\
01136     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01137     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01138     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01139     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01140 }\
01141 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
01142     uint8_t full[24*17];\
01143     uint8_t halfH[272];\
01144     uint8_t halfHV[256];\
01145     copy_block17(full, src, 24, stride, 17);\
01146     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01147     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01148     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01149     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01150 }\
01151 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
01152     uint8_t full[24*17];\
01153     uint8_t halfH[272];\
01154     uint8_t halfV[256];\
01155     uint8_t halfHV[256];\
01156     copy_block17(full, src, 24, stride, 17);\
01157     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01158     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01159     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01160     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01161 }\
01162 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
01163     uint8_t full[24*17];\
01164     uint8_t halfH[272];\
01165     uint8_t halfHV[256];\
01166     copy_block17(full, src, 24, stride, 17);\
01167     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01168     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01169     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01170     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01171 }\
01172 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
01173     uint8_t full[24*17];\
01174     uint8_t halfH[272];\
01175     uint8_t halfV[256];\
01176     uint8_t halfHV[256];\
01177     copy_block17(full, src, 24, stride, 17);\
01178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01179     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01180     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01181     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01182 }\
01183 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01184     uint8_t full[24*17];\
01185     uint8_t halfH[272];\
01186     uint8_t halfHV[256];\
01187     copy_block17(full, src, 24, stride, 17);\
01188     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01189     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01190     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01191     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01192 }\
01193 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01194     uint8_t full[24*17];\
01195     uint8_t halfH[272];\
01196     uint8_t halfV[256];\
01197     uint8_t halfHV[256];\
01198     copy_block17(full, src, 24, stride, 17);\
01199     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
01200     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01201     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01202     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
01203 }\
01204 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01205     uint8_t full[24*17];\
01206     uint8_t halfH[272];\
01207     uint8_t halfHV[256];\
01208     copy_block17(full, src, 24, stride, 17);\
01209     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01210     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01211     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01212     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01213 }\
01214 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01215     uint8_t halfH[272];\
01216     uint8_t halfHV[256];\
01217     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01218     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01219     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
01220 }\
01221 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01222     uint8_t halfH[272];\
01223     uint8_t halfHV[256];\
01224     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01225     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01226     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
01227 }\
01228 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01229     uint8_t full[24*17];\
01230     uint8_t halfH[272];\
01231     uint8_t halfV[256];\
01232     uint8_t halfHV[256];\
01233     copy_block17(full, src, 24, stride, 17);\
01234     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01235     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
01236     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01237     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
01238 }\
01239 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01240     uint8_t full[24*17];\
01241     uint8_t halfH[272];\
01242     copy_block17(full, src, 24, stride, 17);\
01243     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01244     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
01245     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01246 }\
01247 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01248     uint8_t full[24*17];\
01249     uint8_t halfH[272];\
01250     uint8_t halfV[256];\
01251     uint8_t halfHV[256];\
01252     copy_block17(full, src, 24, stride, 17);\
01253     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01254     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
01255     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
01256     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
01257 }\
01258 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01259     uint8_t full[24*17];\
01260     uint8_t halfH[272];\
01261     copy_block17(full, src, 24, stride, 17);\
01262     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
01263     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
01264     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01265 }\
01266 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
01267     uint8_t halfH[272];\
01268     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
01269     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
01270 }
01271 
01272 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
01273 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
01274 #define op_put(a, b) a = cm[((b) + 16)>>5]
01275 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
01276 
01277 QPEL_MC(0, put_       , _       , op_put)
01278 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
01279 QPEL_MC(0, avg_       , _       , op_avg)
01280 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
01281 #undef op_avg
01282 #undef op_avg_no_rnd
01283 #undef op_put
01284 #undef op_put_no_rnd
01285 
/*
 * The mc00 entries have no dedicated qpel implementation in this file;
 * each name is simply an alias for an existing ff_*_pixels* function.
 * NOTE(review): the 16x16 no_rnd alias targets the "_8_c" symbol while the
 * other aliases target the plain "_c" ones -- confirm this is intended.
 */

/* 8x8 blocks */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c

/* 16x16 blocks */
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_8_c
01292 
01293 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
01294     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
01295     int i;
01296 
01297     for(i=0; i<h; i++){
01298         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
01299         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
01300         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
01301         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
01302         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
01303         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
01304         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
01305         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
01306         dst+=dstStride;
01307         src+=srcStride;
01308     }
01309 }
01310 
01311 #if CONFIG_RV40_DECODER
/**
 * RV40 16x16 "put" entry for the mc33 position.
 *
 * Pure delegation to put_pixels16_xy2_8_c() with a last argument of 16
 * (presumably the number of rows).
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/**
 * RV40 16x16 "avg" entry for the mc33 position.
 *
 * Pure delegation to avg_pixels16_xy2_8_c() with a last argument of 16
 * (presumably the number of rows).
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/**
 * RV40 8x8 "put" entry for the mc33 position.
 *
 * Pure delegation to put_pixels8_xy2_8_c() with a last argument of 8
 * (presumably the number of rows).
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/**
 * RV40 8x8 "avg" entry for the mc33 position.
 *
 * Pure delegation to avg_pixels8_xy2_8_c() with a last argument of 8
 * (presumably the number of rows).
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
01324 #endif /* CONFIG_RV40_DECODER */
01325 
01326 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
01327     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
01328     int i;
01329 
01330     for(i=0; i<w; i++){
01331         const int src_1= src[ -srcStride];
01332         const int src0 = src[0          ];
01333         const int src1 = src[  srcStride];
01334         const int src2 = src[2*srcStride];
01335         const int src3 = src[3*srcStride];
01336         const int src4 = src[4*srcStride];
01337         const int src5 = src[5*srcStride];
01338         const int src6 = src[6*srcStride];
01339         const int src7 = src[7*srcStride];
01340         const int src8 = src[8*srcStride];
01341         const int src9 = src[9*srcStride];
01342         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
01343         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
01344         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
01345         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
01346         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
01347         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
01348         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
01349         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
01350         src++;
01351         dst++;
01352     }
01353 }
01354 
/**
 * 8x8 mspel "put" for the mc10 position.
 *
 * Filters the block horizontally into a temporary 8x8 buffer, then hands
 * the unfiltered source and that buffer to put_pixels8_l2_8() (presumably
 * an average of its two inputs) to produce dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
01360 
/**
 * 8x8 mspel "put" for the mc20 position: the horizontally filtered block
 * is written straight to dst (8 rows, same stride for source and
 * destination).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
01364 
/**
 * 8x8 mspel "put" for the mc30 position.
 *
 * Same as the mc10 case except that the unfiltered input given to
 * put_pixels8_l2_8() starts one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
01370 
/**
 * 8x8 mspel "put" for the mc02 position: the vertically filtered block is
 * written straight to dst (8 columns, same stride for source and
 * destination).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
01374 
/**
 * 8x8 mspel "put" for the mc12 position.
 *
 * Produces two intermediate 8x8 blocks -- the source filtered vertically
 * only, and the source filtered horizontally then vertically -- and passes
 * both to put_pixels8_l2_8() (presumably an average) to produce dst.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8 * 11]; /* 11 horizontally filtered rows, first one taken from the row above src */
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* start at the second stored row: the vertical filter reads one row back */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * 8x8 mspel "put" for the mc32 position.
 *
 * Same construction as the mc12 case, except that the vertical-only
 * intermediate is computed from the source shifted one pixel to the right
 * (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8 * 11]; /* 11 horizontally filtered rows, first one taken from the row above src */
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    /* start at the second stored row: the vertical filter reads one row back */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * 8x8 mspel "put" for the mc22 position.
 *
 * Filters 11 rows horizontally (beginning one row above src) into a
 * temporary buffer, then filters that buffer vertically directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* start at the second stored row: the vertical filter reads one row back */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
01398 
01399 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
01400     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
01401     int x;
01402     const int strength= ff_h263_loop_filter_strength[qscale];
01403 
01404     for(x=0; x<8; x++){
01405         int d1, d2, ad1;
01406         int p0= src[x-2*stride];
01407         int p1= src[x-1*stride];
01408         int p2= src[x+0*stride];
01409         int p3= src[x+1*stride];
01410         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
01411 
01412         if     (d<-2*strength) d1= 0;
01413         else if(d<-  strength) d1=-2*strength - d;
01414         else if(d<   strength) d1= d;
01415         else if(d< 2*strength) d1= 2*strength - d;
01416         else                   d1= 0;
01417 
01418         p1 += d1;
01419         p2 -= d1;
01420         if(p1&256) p1= ~(p1>>31);
01421         if(p2&256) p2= ~(p2>>31);
01422 
01423         src[x-1*stride] = p1;
01424         src[x+0*stride] = p2;
01425 
01426         ad1= FFABS(d1)>>1;
01427 
01428         d2= av_clip((p0-p3)/4, -ad1, ad1);
01429 
01430         src[x-2*stride] = p0 - d2;
01431         src[x+  stride] = p3 + d2;
01432     }
01433     }
01434 }
01435 
01436 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
01437     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
01438     int y;
01439     const int strength= ff_h263_loop_filter_strength[qscale];
01440 
01441     for(y=0; y<8; y++){
01442         int d1, d2, ad1;
01443         int p0= src[y*stride-2];
01444         int p1= src[y*stride-1];
01445         int p2= src[y*stride+0];
01446         int p3= src[y*stride+1];
01447         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
01448 
01449         if     (d<-2*strength) d1= 0;
01450         else if(d<-  strength) d1=-2*strength - d;
01451         else if(d<   strength) d1= d;
01452         else if(d< 2*strength) d1= 2*strength - d;
01453         else                   d1= 0;
01454 
01455         p1 += d1;
01456         p2 -= d1;
01457         if(p1&256) p1= ~(p1>>31);
01458         if(p2&256) p2= ~(p2>>31);
01459 
01460         src[y*stride-1] = p1;
01461         src[y*stride+0] = p2;
01462 
01463         ad1= FFABS(d1)>>1;
01464 
01465         d2= av_clip((p0-p3)/4, -ad1, ad1);
01466 
01467         src[y*stride-2] = p0 - d2;
01468         src[y*stride+1] = p3 + d2;
01469     }
01470     }
01471 }
01472 
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * A vertical pass stores unnormalised sums (weight 4) in a scratch buffer;
 * the top and bottom rows are not filtered vertically and are just scaled
 * by 4. A horizontal pass then writes the result back: the left and right
 * columns only undo the vertical weight (rounded divide by 4), all other
 * pixels get the horizontal (1, 2, 1) taps and a rounded divide by 16.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between consecutive rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[8 * 8];
    int row, col;

    /* vertical pass: border rows are copied with weight 4 */
    for (col = 0; col < 8; col++) {
        vert[col]         = 4 * src[col];
        vert[col + 7 * 8] = 4 * src[col + 7 * stride];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *const cur = src + row * stride;
        int *const sums = vert + row * 8;

        for (col = 0; col < 8; col++)
            sums[col] = cur[col - stride] + 2 * cur[col] + cur[col + stride];
    }

    /* horizontal pass: border columns only get normalised */
    for (row = 0; row < 8; row++) {
        const int *const sums = vert + row * 8;
        uint8_t *const out = src + row * stride;

        out[0] = (sums[0] + 2) >> 2;
        out[7] = (sums[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (sums[col - 1] + 2 * sums[col] + sums[col + 1] + 8) >> 4;
    }
}
01499 
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated |pix1 - pix2| over all 16 * h pixel pairs
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
01527 
/**
 * Sum of absolute differences, 16 pixels wide, against a horizontally
 * interpolated reference: every reference sample is avg2() of a pixel of
 * pix2 and its right-hand neighbour, so 17 columns of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01555 
/**
 * Sum of absolute differences, 16 pixels wide, against a vertically
 * interpolated reference: every reference sample is avg2() of a pixel of
 * pix2 and the pixel one row below it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01585 
/**
 * Sum of absolute differences, 16 pixels wide, against a reference
 * interpolated in both directions: every reference sample is avg4() of a
 * 2x2 neighbourhood of pix2, so 17 columns and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01615 
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01635 
/**
 * Sum of absolute differences, 8 pixels wide, against a horizontally
 * interpolated reference: every reference sample is avg2() of a pixel of
 * pix2 and its right-hand neighbour, so 9 columns of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01655 
/**
 * Sum of absolute differences, 8 pixels wide, against a vertically
 * interpolated reference: every reference sample is avg2() of a pixel of
 * pix2 and the pixel one row below it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01677 
/**
 * Sum of absolute differences, 8 pixels wide, against a reference
 * interpolated in both directions: every reference sample is avg4() of a
 * 2x2 neighbourhood of pix2, so 9 columns and h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows to compare
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
01699 
01700 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
01701     MpegEncContext *c = v;
01702     int score1=0;
01703     int score2=0;
01704     int x,y;
01705 
01706     for(y=0; y<h; y++){
01707         for(x=0; x<16; x++){
01708             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
01709         }
01710         if(y+1<h){
01711             for(x=0; x<15; x++){
01712                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
01713                              - s1[x+1] + s1[x+1+stride])
01714                         -FFABS(  s2[x  ] - s2[x  +stride]
01715                              - s2[x+1] + s2[x+1+stride]);
01716             }
01717         }
01718         s1+= stride;
01719         s2+= stride;
01720     }
01721 
01722     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01723     else  return score1 + FFABS(score2)*8;
01724 }
01725 
01726 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
01727     MpegEncContext *c = v;
01728     int score1=0;
01729     int score2=0;
01730     int x,y;
01731 
01732     for(y=0; y<h; y++){
01733         for(x=0; x<8; x++){
01734             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
01735         }
01736         if(y+1<h){
01737             for(x=0; x<7; x++){
01738                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
01739                              - s1[x+1] + s1[x+1+stride])
01740                         -FFABS(  s2[x  ] - s2[x  +stride]
01741                              - s2[x+1] + s2[x+1+stride]);
01742             }
01743         }
01744         s1+= stride;
01745         s2+= stride;
01746     }
01747 
01748     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01749     else  return score1 + FFABS(score2)*8;
01750 }
01751 
01752 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
01753     int i;
01754     unsigned int sum=0;
01755 
01756     for(i=0; i<8*8; i++){
01757         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
01758         int w= weight[i];
01759         b>>= RECON_SHIFT;
01760         assert(-512<b && b<512);
01761 
01762         sum += (w*b)*(w*b)>>4;
01763     }
01764     return sum>>2;
01765 }
01766 
01767 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
01768     int i;
01769 
01770     for(i=0; i<8*8; i++){
01771         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
01772     }
01773 }
01774 
01783 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
01784 {
01785     int i;
01786     DCTELEM temp[64];
01787 
01788     if(last<=0) return;
01789     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
01790 
01791     for(i=0; i<=last; i++){
01792         const int j= scantable[i];
01793         temp[j]= block[j];
01794         block[j]=0;
01795     }
01796 
01797     for(i=0; i<=last; i++){
01798         const int j= scantable[i];
01799         const int perm_j= permutation[j];
01800         block[perm_j]= temp[j];
01801     }
01802 }
01803 
/**
 * Comparison function that ignores its arguments and always reports a
 * score of zero; selected by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
01807 
01808 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
01809     int i;
01810 
01811     memset(cmp, 0, sizeof(void*)*6);
01812 
01813     for(i=0; i<6; i++){
01814         switch(type&0xFF){
01815         case FF_CMP_SAD:
01816             cmp[i]= c->sad[i];
01817             break;
01818         case FF_CMP_SATD:
01819             cmp[i]= c->hadamard8_diff[i];
01820             break;
01821         case FF_CMP_SSE:
01822             cmp[i]= c->sse[i];
01823             break;
01824         case FF_CMP_DCT:
01825             cmp[i]= c->dct_sad[i];
01826             break;
01827         case FF_CMP_DCT264:
01828             cmp[i]= c->dct264_sad[i];
01829             break;
01830         case FF_CMP_DCTMAX:
01831             cmp[i]= c->dct_max[i];
01832             break;
01833         case FF_CMP_PSNR:
01834             cmp[i]= c->quant_psnr[i];
01835             break;
01836         case FF_CMP_BIT:
01837             cmp[i]= c->bit[i];
01838             break;
01839         case FF_CMP_RD:
01840             cmp[i]= c->rd[i];
01841             break;
01842         case FF_CMP_VSAD:
01843             cmp[i]= c->vsad[i];
01844             break;
01845         case FF_CMP_VSSE:
01846             cmp[i]= c->vsse[i];
01847             break;
01848         case FF_CMP_ZERO:
01849             cmp[i]= zero_cmp;
01850             break;
01851         case FF_CMP_NSSE:
01852             cmp[i]= c->nsse[i];
01853             break;
01854 #if CONFIG_DWT
01855         case FF_CMP_W53:
01856             cmp[i]= c->w53[i];
01857             break;
01858         case FF_CMP_W97:
01859             cmp[i]= c->w97[i];
01860             break;
01861 #endif
01862         default:
01863             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
01864         }
01865     }
01866 }
01867 
01868 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
01869     long i;
01870     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01871         long a = *(long*)(src+i);
01872         long b = *(long*)(dst+i);
01873         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
01874     }
01875     for(; i<w; i++)
01876         dst[i+0] += src[i+0];
01877 }
01878 
01879 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
01880     long i;
01881     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01882         long a = *(long*)(src1+i);
01883         long b = *(long*)(src2+i);
01884         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
01885     }
01886     for(; i<w; i++)
01887         dst[i] = src1[i]+src2[i];
01888 }
01889 
01890 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
01891     long i;
01892 #if !HAVE_FAST_UNALIGNED
01893     if((long)src2 & (sizeof(long)-1)){
01894         for(i=0; i+7<w; i+=8){
01895             dst[i+0] = src1[i+0]-src2[i+0];
01896             dst[i+1] = src1[i+1]-src2[i+1];
01897             dst[i+2] = src1[i+2]-src2[i+2];
01898             dst[i+3] = src1[i+3]-src2[i+3];
01899             dst[i+4] = src1[i+4]-src2[i+4];
01900             dst[i+5] = src1[i+5]-src2[i+5];
01901             dst[i+6] = src1[i+6]-src2[i+6];
01902             dst[i+7] = src1[i+7]-src2[i+7];
01903         }
01904     }else
01905 #endif
01906     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
01907         long a = *(long*)(src1+i);
01908         long b = *(long*)(src2+i);
01909         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
01910     }
01911     for(; i<w; i++)
01912         dst[i+0] = src1[i+0]-src2[i+0];
01913 }
01914 
/**
 * Reconstruct a row from its residuals using median prediction.
 *
 * Each output byte is the residual plus mid_pred() of the left output,
 * the byte above (src1) and left + above - above-left (kept to 8 bits).
 *
 * @param dst      reconstructed row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: output left of the row start; out: last output byte
 * @param left_top in: byte above *left; out: last byte of src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t up_left  = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int up   = src1[n];
        const int pred = mid_pred(cur, up, (cur + up - up_left) & 0xFF);

        cur     = pred + diff[n];
        up_left = up;
        dst[n]  = cur;
    }

    *left     = cur;
    *left_top = up_left;
}
01931 
/**
 * Compute the median-prediction residuals of a row.
 *
 * Each residual is the source byte (src2) minus mid_pred() of the left
 * source byte, the byte above (src1) and left + above - above-left
 * (kept to 8 bits).
 *
 * @param dst      residuals
 * @param src1     row above
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: byte left of the row start; out: last byte of src2
 * @param left_top in: byte above *left; out: last byte of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int up   = src1[n];
        const int pred = mid_pred(cur, up, (cur + up - up_left) & 0xFF);

        up_left = up;
        cur     = src2[n];
        dst[n]  = cur - pred;
    }

    *left     = cur;
    *left_top = up_left;
}
01949 
/**
 * Reconstruct a row from left-prediction residuals: every output byte is
 * the low 8 bits of the running sum of the residuals so far.
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of bytes
 * @param acc running sum carried in from the left
 * @return the running sum after the last byte (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    const uint8_t *in  = src;
    uint8_t       *out = dst;
    int remaining;

    for (remaining = w; remaining > 0; remaining--) {
        acc   += *in++;
        *out++ = acc;
    }

    return acc;
}
01968 
/* byte offset of each component inside a 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a row of 4-byte pixels from left-prediction residuals,
 * keeping an independent running sum per component.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in/out running sum of the red component
 * @param green in/out running sum of the green component
 * @param blue  in/out running sum of the blue component
 * @param alpha in/out running sum of the alpha component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int n;

    for (n = 0; n < w; n++, src += 4, dst += 4) {
        /* read the whole pixel before writing it */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
02009 
/*
 * Butterfly: store the sum of i1 and i2 in o1 and their difference in o2.
 * Expands to two statements without enclosing braces, so it must not be
 * used as the unbraced body of an if/for/while.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/*
 * In-place butterfly: x becomes x+y and y becomes x-y (both computed from
 * the original values).  The expansion declares locals named a and b, so
 * the arguments must not be expressions that refer to variables with
 * those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of the absolute values of both butterfly outputs, |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
02024 
/**
 * Sum of absolute butterfly-transformed differences of two 8x8 blocks.
 *
 * The pixel difference src - dst is run through three butterfly stages
 * along each row, then three along each column; the last column stage is
 * folded into the absolute-value summation.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows, for both blocks
 * @param h      number of rows, must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass on the difference signal */
    for (i = 0; i < 8; i++) {
        int           *row = temp + 8 * i;
        const uint8_t *p   = src + stride * i;
        const uint8_t *q   = dst + stride * i;

        BUTTERFLY2(row[0], row[1], p[0] - q[0], p[1] - q[1]);
        BUTTERFLY2(row[2], row[3], p[2] - q[2], p[3] - q[3]);
        BUTTERFLY2(row[4], row[5], p[4] - q[4], p[5] - q[5]);
        BUTTERFLY2(row[6], row[7], p[6] - q[6], p[7] - q[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the final stage only contributes magnitudes */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
02069 
/**
 * Sum of absolute butterfly-transformed values of a single 8x8 block,
 * with the contribution of the first (mean) coefficient removed.
 *
 * The pixels are run through three butterfly stages along each row, then
 * three along each column; the last column stage is folded into the
 * absolute-value summation.
 *
 * @param s      unused
 * @param src    the block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows, must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass on the pixels themselves */
    for (i = 0; i < 8; i++) {
        int           *row = temp + 8 * i;
        const uint8_t *p   = src + stride * i;

        BUTTERFLY2(row[0], row[1], p[0], p[1]);
        BUTTERFLY2(row[2], row[3], p[2], p[3]);
        BUTTERFLY2(row[4], row[5], p[4], p[5]);
        BUTTERFLY2(row[6], row[7], p[6], p[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the final stage only contributes magnitudes */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* drop the mean: this is the first coefficient's share of the sum */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
02117 
02118 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02119     MpegEncContext * const s= (MpegEncContext *)c;
02120     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02121 
02122     assert(h==8);
02123 
02124     s->dsp.diff_pixels(temp, src1, src2, stride);
02125     s->dsp.fdct(temp);
02126     return s->dsp.sum_abs_dctelem(temp);
02127 }
02128 
#if CONFIG_GPL
/*
 * One 8-point integer transform pass.  Inputs are fetched through
 * SRC(0..7) and results are delivered through DST(index, value); both
 * must be defined by the user of this macro.  All inputs are read before
 * the first DST(), so DST() may overwrite what SRC() reads.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D integer transform applied
 * to the difference of two 8x8 blocks.
 *
 * @param c      MpegEncContext pointer
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows, for both blocks
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s = c;
    DCTELEM coef[8][8];
    int sad = 0;
    int i;

    s->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) coef[i][x]
#define DST(x,v) coef[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform every column, accumulating magnitudes only */
#define SRC(x) coef[x][i]
#define DST(x,v) sad += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST
    return sad;
}
#endif
02181 
02182 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02183     MpegEncContext * const s= (MpegEncContext *)c;
02184     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02185     int sum=0, i;
02186 
02187     assert(h==8);
02188 
02189     s->dsp.diff_pixels(temp, src1, src2, stride);
02190     s->dsp.fdct(temp);
02191 
02192     for(i=0; i<64; i++)
02193         sum= FFMAX(sum, FFABS(temp[i]));
02194 
02195     return sum;
02196 }
02197 
02198 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02199     MpegEncContext * const s= (MpegEncContext *)c;
02200     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
02201     DCTELEM * const bak = temp+64;
02202     int sum=0, i;
02203 
02204     assert(h==8);
02205     s->mb_intra=0;
02206 
02207     s->dsp.diff_pixels(temp, src1, src2, stride);
02208 
02209     memcpy(bak, temp, 64*sizeof(DCTELEM));
02210 
02211     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02212     s->dct_unquantize_inter(s, temp, 0, s->qscale);
02213     ff_simple_idct_8(temp); //FIXME
02214 
02215     for(i=0; i<64; i++)
02216         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
02217 
02218     return sum;
02219 }
02220 
02221 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02222     MpegEncContext * const s= (MpegEncContext *)c;
02223     const uint8_t *scantable= s->intra_scantable.permutated;
02224     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02225     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
02226     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
02227     int i, last, run, bits, level, distortion, start_i;
02228     const int esc_length= s->ac_esc_length;
02229     uint8_t * length;
02230     uint8_t * last_length;
02231 
02232     assert(h==8);
02233 
02234     copy_block8(lsrc1, src1, 8, stride, 8);
02235     copy_block8(lsrc2, src2, 8, stride, 8);
02236 
02237     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
02238 
02239     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02240 
02241     bits=0;
02242 
02243     if (s->mb_intra) {
02244         start_i = 1;
02245         length     = s->intra_ac_vlc_length;
02246         last_length= s->intra_ac_vlc_last_length;
02247         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
02248     } else {
02249         start_i = 0;
02250         length     = s->inter_ac_vlc_length;
02251         last_length= s->inter_ac_vlc_last_length;
02252     }
02253 
02254     if(last>=start_i){
02255         run=0;
02256         for(i=start_i; i<last; i++){
02257             int j= scantable[i];
02258             level= temp[j];
02259 
02260             if(level){
02261                 level+=64;
02262                 if((level&(~127)) == 0){
02263                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
02264                 }else
02265                     bits+= esc_length;
02266                 run=0;
02267             }else
02268                 run++;
02269         }
02270         i= scantable[last];
02271 
02272         level= temp[i] + 64;
02273 
02274         assert(level - 64);
02275 
02276         if((level&(~127)) == 0){
02277             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
02278         }else
02279             bits+= esc_length;
02280 
02281     }
02282 
02283     if(last>=0){
02284         if(s->mb_intra)
02285             s->dct_unquantize_intra(s, temp, 0, s->qscale);
02286         else
02287             s->dct_unquantize_inter(s, temp, 0, s->qscale);
02288     }
02289 
02290     s->dsp.idct_add(lsrc2, 8, temp);
02291 
02292     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
02293 
02294     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
02295 }
02296 
02297 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
02298     MpegEncContext * const s= (MpegEncContext *)c;
02299     const uint8_t *scantable= s->intra_scantable.permutated;
02300     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
02301     int i, last, run, bits, level, start_i;
02302     const int esc_length= s->ac_esc_length;
02303     uint8_t * length;
02304     uint8_t * last_length;
02305 
02306     assert(h==8);
02307 
02308     s->dsp.diff_pixels(temp, src1, src2, stride);
02309 
02310     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
02311 
02312     bits=0;
02313 
02314     if (s->mb_intra) {
02315         start_i = 1;
02316         length     = s->intra_ac_vlc_length;
02317         last_length= s->intra_ac_vlc_last_length;
02318         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
02319     } else {
02320         start_i = 0;
02321         length     = s->inter_ac_vlc_length;
02322         last_length= s->inter_ac_vlc_last_length;
02323     }
02324 
02325     if(last>=start_i){
02326         run=0;
02327         for(i=start_i; i<last; i++){
02328             int j= scantable[i];
02329             level= temp[j];
02330 
02331             if(level){
02332                 level+=64;
02333                 if((level&(~127)) == 0){
02334                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
02335                 }else
02336                     bits+= esc_length;
02337                 run=0;
02338             }else
02339                 run++;
02340         }
02341         i= scantable[last];
02342 
02343         level= temp[i] + 64;
02344 
02345         assert(level - 64);
02346 
02347         if((level&(~127)) == 0){
02348             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
02349         }else
02350             bits+= esc_length;
02351     }
02352 
02353     return bits;
02354 }
02355 
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over h - 1 row pairs of a
 * block size pixels wide.  The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                            \
    int col, row;                                             \
                                                              \
    for (row = 1; row < h; row++) {                           \
        for (col = 0; col < size; col++)                      \
            total += FFABS(s[col] - s[col + stride]);         \
        s += stride;                                          \
    }                                                         \
                                                              \
    return total;                                             \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
02373 
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks: the sum,
 * over h - 1 row pairs, of the absolute change of (s1 - s2) from one row
 * to the next.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows, for both blocks
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
02388 
#define SQ(a) ((a)*(a))
/**
 * Generate vsse_intra<size>_c(): sum of squared differences between each
 * pixel and the pixel directly below it, over a block that is <size>
 * pixels wide and h rows high (h-1 row pairs). Columns are consumed in
 * groups of four. The c and dummy parameters are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col, k;                                                                                        \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        const uint8_t *below = s + stride;                                                                  \
        for (col = 0; col < size; col += 4) {                                                               \
            for (k = col; k < col + 4; k++)                                                                 \
                total += SQ(s[k] - below[k]);                                                               \
        }                                                                                                   \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
02407 
/**
 * Vertical SSE between two 16-pixel-wide blocks.
 *
 * Same measure as the vertical SAD variant, but the change of the
 * difference s1-s2 between vertically adjacent rows is squared instead of
 * taken as an absolute value.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte distance between consecutive rows (used for both blocks)
 * @param h      number of rows; h-1 row pairs are evaluated
 * @return accumulated score (0 when h < 2)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - below1[col] + below2[col];
            sum += d * d;
        }
    }

    return sum;
}
02422 
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector (8-bit signed samples)
 * @param pix2 second vector (16-bit signed samples)
 * @param size number of elements compared
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k = 0;

    while (k < size) {
        const int diff = pix1[k] - pix2[k];
        sum += diff * diff;
        k++;
    }
    return sum;
}
02431 
/*
 * Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * WRAPPER8_16_SQ is defined outside this section; presumably it builds the
 * 16-wide variant by applying the 8x8 function to each 8x8 sub-block and
 * summing the results -- confirm against the macro definition.
 * The dct264_sad pair only exists in CONFIG_GPL builds.
 */
02432 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
02433 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
02434 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
02435 #if CONFIG_GPL
02436 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
02437 #endif
02438 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
02439 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
02440 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
02441 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
02442 
/**
 * Element-wise product of two float vectors.
 *
 * @param dst  output, dst[k] = src0[k] * src1[k]
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
02448 
/**
 * Element-wise product of src0 with src1 read back to front.
 *
 * @param dst  output, dst[k] = src0[k] * src1[len-1-k]
 * @param src0 first factor, read in forward order
 * @param src1 second factor, read in reverse order
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;
        dst[k] = src0[k] * src1[mirrored];
    }
}
02455 
/**
 * Element-wise multiply-add of float vectors.
 *
 * @param dst  output, dst[k] = src0[k] * src1[k] + src2[k]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
02461 
/**
 * Windowed overlap of two float vectors.
 *
 * Produces 2*len outputs. For k in [0, len) and its mirror position
 * m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements, read forward
 * @param src1 second input, len elements, read backward
 * @param win  window, 2*len elements
 * @param len  half of the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int m = 2 * len - 1 - k;      /* mirrored output/window index */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[m];

        dst[k] = s0*wj - s1*wi;
        dst[m] = s0*wi + s1*wj;
    }
}
02478 
/**
 * Multiply a float vector by a scalar.
 *
 * @param dst output, dst[k] = src[k] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
02486 
/**
 * Multiply a float vector by a scalar and accumulate into dst.
 *
 * @param dst accumulator, updated as dst[k] += src[k] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] += src[k] * mul;
        k++;
    }
}
02494 
/**
 * In-place butterfly on two float vectors.
 *
 * After the call v1[k] holds the sum and v2[k] the difference
 * (old v1[k] - old v2[k]) of the original elements.
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
02505 
/**
 * Butterfly of two float vectors with interleaved output.
 *
 * @param dst  output of 2*len elements: dst[2k] = src0[k] + src1[k],
 *             dst[2k+1] = src0[k] - src1[k]
 * @param src0 first input
 * @param src1 second input
 * @param len  number of input elements per vector
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = src0[k];
        const float b = src1[k];
        float *pair   = dst + 2 * k;

        pair[0] = a + b;
        pair[1] = a - b;
    }
}
02517 
/**
 * Dot product of two float vectors, accumulated in single precision from
 * the first element to the last.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[k] * v2[k]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
02528 
/**
 * Clip one float given as its raw 32-bit pattern.
 *
 * Relies on the sign living in bit 31 (the code tests and flips 1U<<31).
 * The one visible caller only takes this path when min < 0 < max, so mini
 * has the sign bit set and maxi does not:
 *  - an unsigned a > mini is a negative value of larger magnitude than min;
 *  - after flipping the sign bit, a positive value larger than max compares
 *    above maxisign.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}

/**
 * Clip a float vector to [*min, *max] using integer comparisons on the
 * float bit patterns; intended for *min < 0 < *max.
 *
 * The bit patterns are obtained with memcpy() rather than by casting
 * float pointers to uint32_t pointers: reading a float object through a
 * uint32_t lvalue violates the C aliasing rules and is undefined behavior.
 *
 * Elements are processed in groups of 8, so the number of elements read
 * and written is len rounded up to a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements (expected to be a multiple of 8)
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U << 31);

    for (i = 0; i < len; i += 8) {
        for (j = 0; j < 8; j++) {
            uint32_t bits;

            memcpy(&bits, &src[i + j], sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(&dst[i + j], &bits, sizeof(bits));
        }
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are handled in groups of 8, so len rounded up to a
 * multiple of 8 elements are read and written.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements (expected to be a multiple of 8)
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
02573 
/**
 * Dot product of two int16_t vectors with a per-term right shift.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift each product is shifted right by this amount before it is
 *              added to the result
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k = 0;

    while (order--) {
        const int prod = v1[k] * v2[k];

        acc += prod >> shift;
        k++;
    }

    return acc;
}
02583 
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each element the product v1[k]*v2[k] is accumulated using the value
 * of v1[k] from before the update, then v1[k] is increased by mul*v3[k]
 * (stored back as int16_t).
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of the products v1[k] * v2[k]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order--; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
02593 
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k is
 * applied to sample k and to its mirror sample len-1-k. Each product is
 * rounded by adding 1<<14 and shifted right by 15.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window first half of the window
 * @param len    total number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int head;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t w         = window[head];

        output[head] = (MUL16(input[head], w) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], w) + (1 << 14)) >> 15;
    }
}
02606 
/**
 * Clip every element of an int32_t vector to [min, max].
 *
 * Written as a plain counted loop so that any len is handled exactly.
 * The previous 8-way unrolled do/while decremented the unsigned len by 8
 * after each pass; for len == 0 or a len that is not a multiple of 8 the
 * counter wrapped around and the loop kept running far past the end of
 * both buffers.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
02622 
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 consecutive
 * coefficients b[0..7] in place. Results are rounded with 1<<7 and
 * shifted right by 8.
 */
static void wmv2_idct_row(short * b)
{
    /* all inputs are read before the row is overwritten */
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];

    /* step 1 */
    const int a1 = W1*x1 + W7*x7;
    const int a7 = W7*x1 - W1*x7;
    const int a5 = W5*x5 + W3*x3;
    const int a3 = W3*x5 - W5*x3;
    const int a2 = W2*x2 + W6*x6;
    const int a6 = W6*x2 - W2*x6;
    const int a0 = W0*x0 + W0*x4;
    const int a4 = W0*x0 - W0*x4;

    /* step 2: combine the odd terms (inputs 1,3,5,7) */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3 */
    const int rnd = 1 << 7;

    b[0] = (a0+a2+a1+a5 + rnd)>>8;
    b[1] = (a4+a6 +s1   + rnd)>>8;
    b[2] = (a4-a6 +s2   + rnd)>>8;
    b[3] = (a0-a2+a7+a3 + rnd)>>8;
    b[4] = (a0-a2-a7-a3 + rnd)>>8;
    b[5] = (a4-a6 -s2   + rnd)>>8;
    b[6] = (a4+a6 -s1   + rnd)>>8;
    b[7] = (a0+a2-a1-a5 + rnd)>>8;
}
02658 static void wmv2_idct_col(short * b)
02659 {
02660     int s1,s2;
02661     int a0,a1,a2,a3,a4,a5,a6,a7;
02662     /*step 1, with extended precision*/
02663     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
02664     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
02665     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
02666     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
02667     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
02668     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
02669     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
02670     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
02671     /*step 2*/
02672     s1 = (181*(a1-a5+a7-a3)+128)>>8;
02673     s2 = (181*(a1-a5-a7+a3)+128)>>8;
02674     /*step 3*/
02675     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
02676     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
02677     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
02678     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
02679 
02680     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
02681     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
02682     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
02683     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
02684 }
/**
 * In-place WMV2 inverse DCT of a 64-coefficient block: the row pass is run
 * on each group of 8 consecutive coefficients, then the column pass on
 * each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int k;

    for (k = 0; k < 8; k++)
        wmv2_idct_row(block + 8 * k);

    for (k = 0; k < 8; k++)
        wmv2_idct_col(block + k);
}
02695 /* XXX: those functions should be suppressed ASAP when all IDCTs are
02696  converted */
/**
 * WMV2 IDCT followed by a store: transforms block in place with
 * ff_wmv2_idct_c(), then hands the result to ff_put_pixels_clamped_c()
 * together with dest and line_size.
 */
02697 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
02698 {
02699     ff_wmv2_idct_c(block);
02700     ff_put_pixels_clamped_c(block, dest, line_size);
02701 }
/**
 * WMV2 IDCT followed by an add: transforms block in place with
 * ff_wmv2_idct_c(), then hands the result to ff_add_pixels_clamped_c()
 * together with dest and line_size.
 */
02702 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
02703 {
02704     ff_wmv2_idct_c(block);
02705     ff_add_pixels_clamped_c(block, dest, line_size);
02706 }
/**
 * Runs j_rev_dct() on block in place, then passes the result to
 * ff_put_pixels_clamped_c() together with dest and line_size.
 */
02707 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02708 {
02709     j_rev_dct (block);
02710     ff_put_pixels_clamped_c(block, dest, line_size);
02711 }
/**
 * Runs j_rev_dct() on block in place, then passes the result to
 * ff_add_pixels_clamped_c() together with dest and line_size.
 */
02712 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02713 {
02714     j_rev_dct (block);
02715     ff_add_pixels_clamped_c(block, dest, line_size);
02716 }
02717 
/**
 * Runs j_rev_dct4() on block in place, then passes the result to
 * put_pixels_clamped4_c() together with dest and line_size.
 * Installed as idct_put by dsputil_init() when avctx->lowres == 1.
 */
02718 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
02719 {
02720     j_rev_dct4 (block);
02721     put_pixels_clamped4_c(block, dest, line_size);
02722 }
/**
 * Runs j_rev_dct4() on block in place, then passes the result to
 * add_pixels_clamped4_c() together with dest and line_size.
 * Installed as idct_add by dsputil_init() when avctx->lowres == 1.
 */
02723 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
02724 {
02725     j_rev_dct4 (block);
02726     add_pixels_clamped4_c(block, dest, line_size);
02727 }
02728 
/**
 * Runs j_rev_dct2() on block in place, then passes the result to
 * put_pixels_clamped2_c() together with dest and line_size.
 * Installed as idct_put by dsputil_init() when avctx->lowres == 2.
 */
02729 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
02730 {
02731     j_rev_dct2 (block);
02732     put_pixels_clamped2_c(block, dest, line_size);
02733 }
/**
 * Runs j_rev_dct2() on block in place, then passes the result to
 * add_pixels_clamped2_c() together with dest and line_size.
 * Installed as idct_add by dsputil_init() when avctx->lowres == 2.
 */
02734 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
02735 {
02736     j_rev_dct2 (block);
02737     add_pixels_clamped2_c(block, dest, line_size);
02738 }
02739 
/**
 * Single-pixel "IDCT" store: writes dest[0] from block[0] only, using a
 * rounded shift by 3 ((block[0] + 4) >> 3) clipped via av_clip_uint8().
 * line_size is not used. Installed as idct_put by dsputil_init() when
 * avctx->lowres == 3.
 */
02740 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
02741 {
02742     dest[0] = av_clip_uint8((block[0] + 4)>>3);
02743 }
/**
 * Single-pixel "IDCT" add: adds the rounded value (block[0] + 4) >> 3 to
 * dest[0] and clips the sum via av_clip_uint8(). line_size is not used.
 * Installed as idct_add by dsputil_init() when avctx->lowres == 3.
 */
02744 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
02745 {
02746     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
02747 }
02748 
/* No-op that ignores all arguments; dsputil_init() installs it as the default c->prefetch. */
02749 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
02750 
02751 /* init static data */
02752 av_cold void dsputil_static_init(void)
02753 {
02754     int i;
02755 
02756     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
02757     for(i=0;i<MAX_NEG_CROP;i++) {
02758         ff_cropTbl[i] = 0;
02759         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
02760     }
02761 
02762     for(i=0;i<512;i++) {
02763         ff_squareTbl[i] = (i - 256) * (i - 256);
02764     }
02765 
02766     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
02767 }
02768 
02769 int ff_check_alignment(void){
02770     static int did_fail=0;
02771     LOCAL_ALIGNED_16(int, aligned, [4]);
02772 
02773     if((intptr_t)aligned & 15){
02774         if(!did_fail){
02775 #if HAVE_MMX || HAVE_ALTIVEC
02776             av_log(NULL, AV_LOG_ERROR,
02777                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
02778                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
02779                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
02780                 "Do not report crashes to Libav developers.\n");
02781 #endif
02782             did_fail=1;
02783         }
02784         return -1;
02785     }
02786     return 0;
02787 }
02788 
/**
 * Fill a DSPContext with the C implementations from this file, then call
 * the enabled architecture-specific initializers (dsputil_init_mmx() and
 * friends), which presumably replace some pointers with optimized versions.
 *
 * Reads avctx->bits_per_raw_sample, avctx->dct_algo, avctx->idct_algo and
 * avctx->lowres to pick the (I)DCT and the bit-depth dependent functions.
 * c->dct_bits is read by the bit-depth switch but never assigned in this
 * function -- presumably the caller sets it beforehand (TODO confirm).
 *
 * @param c     context whose function pointers are written
 * @param avctx codec context supplying the selection parameters
 */
02789 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
02790 {
02791     int i, j;
02792 
/* Return value is ignored; the call only has the logging side effect. */
02793     ff_check_alignment();
02794 
/* Forward DCT: 10-bit input always uses the islow_10 pair, otherwise dct_algo decides. */
02795 #if CONFIG_ENCODERS
02796     if (avctx->bits_per_raw_sample == 10) {
02797         c->fdct    = ff_jpeg_fdct_islow_10;
02798         c->fdct248 = ff_fdct248_islow_10;
02799     } else {
02800         if(avctx->dct_algo==FF_DCT_FASTINT) {
02801             c->fdct    = fdct_ifast;
02802             c->fdct248 = fdct_ifast248;
02803         }
02804         else if(avctx->dct_algo==FF_DCT_FAAN) {
02805             c->fdct    = ff_faandct;
02806             c->fdct248 = ff_faandct248;
02807         }
02808         else {
02809             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
02810             c->fdct248 = ff_fdct248_islow_8;
02811         }
02812     }
02813 #endif //CONFIG_ENCODERS
02814 
/*
 * Inverse DCT: lowres 1/2/3 take precedence over everything else; for
 * other lowres values 10-bit input selects the simple_idct_10 set, and
 * only then is idct_algo consulted.
 */
02815     if(avctx->lowres==1){
02816         c->idct_put= ff_jref_idct4_put;
02817         c->idct_add= ff_jref_idct4_add;
02818         c->idct    = j_rev_dct4;
02819         c->idct_permutation_type= FF_NO_IDCT_PERM;
02820     }else if(avctx->lowres==2){
02821         c->idct_put= ff_jref_idct2_put;
02822         c->idct_add= ff_jref_idct2_add;
02823         c->idct    = j_rev_dct2;
02824         c->idct_permutation_type= FF_NO_IDCT_PERM;
02825     }else if(avctx->lowres==3){
02826         c->idct_put= ff_jref_idct1_put;
02827         c->idct_add= ff_jref_idct1_add;
02828         c->idct    = j_rev_dct1;
02829         c->idct_permutation_type= FF_NO_IDCT_PERM;
02830     }else{
02831         if (avctx->bits_per_raw_sample == 10) {
02832             c->idct_put              = ff_simple_idct_put_10;
02833             c->idct_add              = ff_simple_idct_add_10;
02834             c->idct                  = ff_simple_idct_10;
02835             c->idct_permutation_type = FF_NO_IDCT_PERM;
02836         } else {
02837         if(avctx->idct_algo==FF_IDCT_INT){
02838             c->idct_put= ff_jref_idct_put;
02839             c->idct_add= ff_jref_idct_add;
02840             c->idct    = j_rev_dct;
02841             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02842         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
02843                 avctx->idct_algo==FF_IDCT_VP3){
02844             c->idct_put= ff_vp3_idct_put_c;
02845             c->idct_add= ff_vp3_idct_add_c;
02846             c->idct    = ff_vp3_idct_c;
02847             c->idct_permutation_type= FF_NO_IDCT_PERM;
02848         }else if(avctx->idct_algo==FF_IDCT_WMV2){
02849             c->idct_put= ff_wmv2_idct_put_c;
02850             c->idct_add= ff_wmv2_idct_add_c;
02851             c->idct    = ff_wmv2_idct_c;
02852             c->idct_permutation_type= FF_NO_IDCT_PERM;
02853         }else if(avctx->idct_algo==FF_IDCT_FAAN){
02854             c->idct_put= ff_faanidct_put;
02855             c->idct_add= ff_faanidct_add;
02856             c->idct    = ff_faanidct;
02857             c->idct_permutation_type= FF_NO_IDCT_PERM;
02858         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
/* NOTE(review): this branch sets only idct_put; idct_add and idct keep whatever value they had. */
02859             c->idct_put= ff_ea_idct_put_c;
02860             c->idct_permutation_type= FF_NO_IDCT_PERM;
02861         }else{ //accurate/default
02862             c->idct_put = ff_simple_idct_put_8;
02863             c->idct_add = ff_simple_idct_add_8;
02864             c->idct     = ff_simple_idct_8;
02865             c->idct_permutation_type= FF_NO_IDCT_PERM;
02866         }
02867         }
02868     }
02869 
02870     c->diff_pixels = diff_pixels_c;
02871     c->put_pixels_clamped = ff_put_pixels_clamped_c;
02872     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
02873     c->add_pixels_clamped = ff_add_pixels_clamped_c;
02874     c->sum_abs_dctelem = sum_abs_dctelem_c;
02875     c->gmc1 = gmc1_c;
02876     c->gmc = ff_gmc_c;
02877     c->pix_sum = pix_sum_c;
02878     c->pix_norm1 = pix_norm1_c;
02879 
02880     c->fill_block_tab[0] = fill_block16_c;
02881     c->fill_block_tab[1] = fill_block8_c;
02882 
02883     /* TODO [0] 16  [1] 8 */
02884     c->pix_abs[0][0] = pix_abs16_c;
02885     c->pix_abs[0][1] = pix_abs16_x2_c;
02886     c->pix_abs[0][2] = pix_abs16_y2_c;
02887     c->pix_abs[0][3] = pix_abs16_xy2_c;
02888     c->pix_abs[1][0] = pix_abs8_c;
02889     c->pix_abs[1][1] = pix_abs8_x2_c;
02890     c->pix_abs[1][2] = pix_abs8_y2_c;
02891     c->pix_abs[1][3] = pix_abs8_xy2_c;
02892 
/* Third-pel tables: only indices 0-2, 4-6 and 8-10 are assigned (3 and 7 are skipped). */
02893     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
02894     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
02895     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
02896     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
02897     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
02898     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
02899     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
02900     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
02901     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
02902 
02903     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
02904     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
02905     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
02906     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
02907     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
02908     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
02909     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
02910     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
02911     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
02912 
/* dspfunc(PFX, IDX, NUM): assign all 16 entries of c->PFX_pixels_tab[IDX] from PFX<NUM>_mcXY_c. */
02913 #define dspfunc(PFX, IDX, NUM) \
02914     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
02915     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
02916     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
02917     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
02918     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
02919     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
02920     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
02921     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
02922     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
02923     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
02924     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
02925     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
02926     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
02927     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
02928     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
02929     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
02930 
02931     dspfunc(put_qpel, 0, 16);
02932     dspfunc(put_no_rnd_qpel, 0, 16);
02933 
02934     dspfunc(avg_qpel, 0, 16);
02935     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
02936 
02937     dspfunc(put_qpel, 1, 8);
02938     dspfunc(put_no_rnd_qpel, 1, 8);
02939 
02940     dspfunc(avg_qpel, 1, 8);
02941     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
02942 
02943 #undef dspfunc
02944 
02945 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
02946     ff_mlp_init(c, avctx);
02947 #endif
02948 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
02949     ff_intrax8dsp_init(c,avctx);
02950 #endif
02951 
02952     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
02953     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
02954     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
02955     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
02956     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
02957     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
02958     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
02959     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
02960 
/* SET_CMP_FUNC(name): slot 0 gets the 16-wide function, slot 1 the 8x8 one. */
02961 #define SET_CMP_FUNC(name) \
02962     c->name[0]= name ## 16_c;\
02963     c->name[1]= name ## 8x8_c;
02964 
02965     SET_CMP_FUNC(hadamard8_diff)
02966     c->hadamard8_diff[4]= hadamard8_intra16_c;
02967     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
02968     SET_CMP_FUNC(dct_sad)
02969     SET_CMP_FUNC(dct_max)
02970 #if CONFIG_GPL
02971     SET_CMP_FUNC(dct264_sad)
02972 #endif
02973     c->sad[0]= pix_abs16_c;
02974     c->sad[1]= pix_abs8_c;
02975     c->sse[0]= sse16_c;
02976     c->sse[1]= sse8_c;
02977     c->sse[2]= sse4_c;
02978     SET_CMP_FUNC(quant_psnr)
02979     SET_CMP_FUNC(rd)
02980     SET_CMP_FUNC(bit)
02981     c->vsad[0]= vsad16_c;
02982     c->vsad[4]= vsad_intra16_c;
02983     c->vsad[5]= vsad_intra8_c;
02984     c->vsse[0]= vsse16_c;
02985     c->vsse[4]= vsse_intra16_c;
02986     c->vsse[5]= vsse_intra8_c;
02987     c->nsse[0]= nsse16_c;
02988     c->nsse[1]= nsse8_c;
02989 #if CONFIG_DWT
02990     ff_dsputil_init_dwt(c);
02991 #endif
02992 
02993     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
02994 
02995     c->add_bytes= add_bytes_c;
02996     c->add_bytes_l2= add_bytes_l2_c;
02997     c->diff_bytes= diff_bytes_c;
02998     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
02999     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
03000     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
03001     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
03002     c->bswap_buf= bswap_buf;
03003     c->bswap16_buf = bswap16_buf;
03004 #if CONFIG_PNG_DECODER
03005     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
03006 #endif
03007 
03008     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
03009         c->h263_h_loop_filter= h263_h_loop_filter_c;
03010         c->h263_v_loop_filter= h263_v_loop_filter_c;
03011     }
03012 
03013     if (CONFIG_VP3_DECODER) {
03014         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
03015         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
03016         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
03017     }
03018 
03019     c->h261_loop_filter= h261_loop_filter_c;
03020 
03021     c->try_8x8basis= try_8x8basis_c;
03022     c->add_8x8basis= add_8x8basis_c;
03023 
03024 #if CONFIG_VORBIS_DECODER
03025     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
03026 #endif
03027 #if CONFIG_AC3_DECODER
03028     c->ac3_downmix = ff_ac3_downmix_c;
03029 #endif
03030     c->vector_fmul = vector_fmul_c;
03031     c->vector_fmul_reverse = vector_fmul_reverse_c;
03032     c->vector_fmul_add = vector_fmul_add_c;
03033     c->vector_fmul_window = vector_fmul_window_c;
03034     c->vector_clipf = vector_clipf_c;
03035     c->scalarproduct_int16 = scalarproduct_int16_c;
03036     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
03037     c->apply_window_int16 = apply_window_int16_c;
03038     c->vector_clip_int32 = vector_clip_int32_c;
03039     c->scalarproduct_float = scalarproduct_float_c;
03040     c->butterflies_float = butterflies_float_c;
03041     c->butterflies_float_interleave = butterflies_float_interleave_c;
03042     c->vector_fmul_scalar = vector_fmul_scalar_c;
03043     c->vector_fmac_scalar = vector_fmac_scalar_c;
03044 
03045     c->shrink[0]= av_image_copy_plane;
03046     c->shrink[1]= ff_shrink22;
03047     c->shrink[2]= ff_shrink44;
03048     c->shrink[3]= ff_shrink88;
03049 
03050     c->prefetch= just_return;
03051 
/* Cleared here so the loop at the end can tell which 2-tap entries were left unset. */
03052     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
03053     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
03054 
/* FUNC/FUNCC build bit-depth suffixed names: f_<depth> and f_<depth>_c. */
03055 #undef FUNC
03056 #undef FUNCC
03057 #define FUNC(f, depth) f ## _ ## depth
03058 #define FUNCC(f, depth) f ## _ ## depth ## _c
03059 
/* dspfunc1: the four full/x2/y2/xy2 variants of a PFX_pixels<NUM> function. */
03060 #define dspfunc1(PFX, IDX, NUM, depth)\
03061     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
03062     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
03063     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
03064     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
03065 
/* dspfunc2: the sixteen mcXY variants of a PFX<NUM> function. */
03066 #define dspfunc2(PFX, IDX, NUM, depth)\
03067     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
03068     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
03069     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
03070     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
03071     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
03072     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
03073     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
03074     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
03075     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
03076     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
03077     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
03078     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
03079     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
03080     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
03081     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
03082     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
03083 
03084 
/*
 * BIT_DEPTH_FUNCS(depth, dct): install every function that depends on the
 * pixel bit depth; dct (_16 or _32) additionally selects the variant of the
 * get_pixels/clear_block(s)/add_pixels functions.
 * Note that avg_h264_qpel is assigned for table indices 0-2 only.
 */
03085 #define BIT_DEPTH_FUNCS(depth, dct)\
03086     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
03087     c->draw_edges                    = FUNCC(draw_edges            , depth);\
03088     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
03089     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
03090     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
03091     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
03092     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
03093     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
03094     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
03095 \
03096     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
03097     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
03098     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
03099     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
03100     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
03101     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
03102 \
03103     dspfunc1(put       , 0, 16, depth);\
03104     dspfunc1(put       , 1,  8, depth);\
03105     dspfunc1(put       , 2,  4, depth);\
03106     dspfunc1(put       , 3,  2, depth);\
03107     dspfunc1(put_no_rnd, 0, 16, depth);\
03108     dspfunc1(put_no_rnd, 1,  8, depth);\
03109     dspfunc1(avg       , 0, 16, depth);\
03110     dspfunc1(avg       , 1,  8, depth);\
03111     dspfunc1(avg       , 2,  4, depth);\
03112     dspfunc1(avg       , 3,  2, depth);\
03113     dspfunc1(avg_no_rnd, 0, 16, depth);\
03114     dspfunc1(avg_no_rnd, 1,  8, depth);\
03115 \
03116     dspfunc2(put_h264_qpel, 0, 16, depth);\
03117     dspfunc2(put_h264_qpel, 1,  8, depth);\
03118     dspfunc2(put_h264_qpel, 2,  4, depth);\
03119     dspfunc2(put_h264_qpel, 3,  2, depth);\
03120     dspfunc2(avg_h264_qpel, 0, 16, depth);\
03121     dspfunc2(avg_h264_qpel, 1,  8, depth);\
03122     dspfunc2(avg_h264_qpel, 2,  4, depth);
03123 
/* 9- and 10-bit depths choose between the _32 and _16 variants via c->dct_bits; 8-bit always uses _16. */
03124     switch (avctx->bits_per_raw_sample) {
03125     case 9:
03126         if (c->dct_bits == 32) {
03127             BIT_DEPTH_FUNCS(9, _32);
03128         } else {
03129             BIT_DEPTH_FUNCS(9, _16);
03130         }
03131         break;
03132     case 10:
03133         if (c->dct_bits == 32) {
03134             BIT_DEPTH_FUNCS(10, _32);
03135         } else {
03136             BIT_DEPTH_FUNCS(10, _16);
03137         }
03138         break;
03139     default:
03140         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
        /* fall through: any other depth gets the 8-bit functions */
03141     case 8:
03142         BIT_DEPTH_FUNCS(8, _16);
03143         break;
03144     }
03145 
03146 
/* Architecture-specific initializers run last so they see the C defaults already in place. */
03147     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
03148     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
03149     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
03150     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
03151     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
03152     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
03153     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
03154     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
03155     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
03156 
/*
 * Any 2-tap qpel entry still NULL after the calls above falls back to the
 * corresponding h264 qpel function.
 * NOTE(review): BIT_DEPTH_FUNCS never assigns avg_h264_qpel_pixels_tab[3],
 * so for i == 3 the avg fallback copies whatever that row already holds.
 */
03157     for (i = 0; i < 4; i++) {
03158         for (j = 0; j < 16; j++) {
03159             if(!c->put_2tap_qpel_pixels_tab[i][j])
03160                 c->put_2tap_qpel_pixels_tab[i][j] =
03161                     c->put_h264_qpel_pixels_tab[i][j];
03162             if(!c->avg_2tap_qpel_pixels_tab[i][j])
03163                 c->avg_2tap_qpel_pixels_tab[i][j] =
03164                     c->avg_h264_qpel_pixels_tab[i][j];
03165         }
03166     }
03167 
/* Build the scan permutation table for whichever idct_permutation_type is now set. */
03168     ff_init_scantable_permutation(c->idct_permutation,
03169                                   c->idct_permutation_type);
03170 }