libavcodec/x86/h264dsp_mmx.c

00001 /*
00002  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
00021 #include "dsputil_mmx.h"
00022 
00023 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
00024 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL;
00025 
00026 /***********************************/
00027 /* IDCT */
00028 
00029 #define SUMSUB_BADC( a, b, c, d ) \
00030     "paddw "#b", "#a" \n\t"\
00031     "paddw "#d", "#c" \n\t"\
00032     "paddw "#b", "#b" \n\t"\
00033     "paddw "#d", "#d" \n\t"\
00034     "psubw "#a", "#b" \n\t"\
00035     "psubw "#c", "#d" \n\t"
00036 
00037 #define SUMSUBD2_AB( a, b, t ) \
00038     "movq  "#b", "#t" \n\t"\
00039     "psraw  $1 , "#b" \n\t"\
00040     "paddw "#a", "#b" \n\t"\
00041     "psraw  $1 , "#a" \n\t"\
00042     "psubw "#t", "#a" \n\t"
00043 
00044 #define IDCT4_1D( s02, s13, d02, d13, t ) \
00045     SUMSUB_BA  ( s02, d02 )\
00046     SUMSUBD2_AB( s13, d13, t )\
00047     SUMSUB_BADC( d13, s02, s13, d02 )
00048 
00049 #define STORE_DIFF_4P( p, t, z ) \
00050     "psraw      $6,     "#p" \n\t"\
00051     "movd       (%0),   "#t" \n\t"\
00052     "punpcklbw "#z",    "#t" \n\t"\
00053     "paddsw    "#t",    "#p" \n\t"\
00054     "packuswb  "#z",    "#p" \n\t"\
00055     "movd      "#p",    (%0) \n\t"
00056 
00057 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
00058 {
00059     /* Load dct coeffs */
00060     __asm__ volatile(
00061         "movq   (%0), %%mm0 \n\t"
00062         "movq  8(%0), %%mm1 \n\t"
00063         "movq 16(%0), %%mm2 \n\t"
00064         "movq 24(%0), %%mm3 \n\t"
00065     :: "r"(block) );
00066 
00067     __asm__ volatile(
00068         /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
00069         IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
00070 
00071         "movq      %0,    %%mm6 \n\t"
00072         /* in: 1,4,0,2  out: 1,2,3,0 */
00073         TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
00074 
00075         "paddw     %%mm6, %%mm3 \n\t"
00076 
00077         /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
00078         IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
00079 
00080         "pxor %%mm7, %%mm7    \n\t"
00081     :: "m"(ff_pw_32));
00082 
00083     __asm__ volatile(
00084     STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
00085         "add %1, %0             \n\t"
00086     STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
00087         "add %1, %0             \n\t"
00088     STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
00089         "add %1, %0             \n\t"
00090     STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
00091         : "+r"(dst)
00092         : "r" ((x86_reg)stride)
00093     );
00094 }
00095 
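/*
 * Illustrative scalar sketch (not part of the original file) of what the
 * SUMSUB_*, IDCT4_1D and STORE_DIFF_4P macros combine to compute in
 * ff_h264_idct_add_mmx above: the H.264 4x4 inverse transform of the
 * residual, followed by a clamped add onto the destination. The rounding
 * bias that the MMX code folds in via ff_pw_32 appears here as "+ 32"
 * before the >> 6. The helper name idct4x4_add_ref is made up for this sketch.
 */
static void idct4x4_add_ref(uint8_t *dst, const int16_t *block, int stride)
{
    int tmp[16], i, j;

    for (i = 0; i < 4; i++) {                 /* vertical pass, one column at a time */
        const int z0 =  block[i +  0]       +  block[i +  8];
        const int z1 =  block[i +  0]       -  block[i +  8];
        const int z2 = (block[i +  4] >> 1) -  block[i + 12];
        const int z3 =  block[i +  4]       + (block[i + 12] >> 1);
        tmp[i +  0] = z0 + z3;
        tmp[i +  4] = z1 + z2;
        tmp[i +  8] = z1 - z2;
        tmp[i + 12] = z0 - z3;
    }
    for (i = 0; i < 4; i++) {                 /* horizontal pass + clamped add to dst */
        const int z0 =  tmp[4*i + 0]       +  tmp[4*i + 2];
        const int z1 =  tmp[4*i + 0]       -  tmp[4*i + 2];
        const int z2 = (tmp[4*i + 1] >> 1) -  tmp[4*i + 3];
        const int z3 =  tmp[4*i + 1]       + (tmp[4*i + 3] >> 1);
        int v[4];
        v[0] = z0 + z3; v[1] = z1 + z2; v[2] = z1 - z2; v[3] = z0 - z3;
        for (j = 0; j < 4; j++) {
            int p = dst[i*stride + j] + ((v[j] + 32) >> 6);
            dst[i*stride + j] = p < 0 ? 0 : p > 255 ? 255 : p;   /* packuswb-style clamp */
        }
    }
}
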
00096 static inline void h264_idct8_1d(int16_t *block)
00097 {
00098     __asm__ volatile(
00099         "movq 112(%0), %%mm7  \n\t"
00100         "movq  80(%0), %%mm0  \n\t"
00101         "movq  48(%0), %%mm3  \n\t"
00102         "movq  16(%0), %%mm5  \n\t"
00103 
00104         "movq   %%mm0, %%mm4  \n\t"
00105         "movq   %%mm5, %%mm1  \n\t"
00106         "psraw  $1,    %%mm4  \n\t"
00107         "psraw  $1,    %%mm1  \n\t"
00108         "paddw  %%mm0, %%mm4  \n\t"
00109         "paddw  %%mm5, %%mm1  \n\t"
00110         "paddw  %%mm7, %%mm4  \n\t"
00111         "paddw  %%mm0, %%mm1  \n\t"
00112         "psubw  %%mm5, %%mm4  \n\t"
00113         "paddw  %%mm3, %%mm1  \n\t"
00114 
00115         "psubw  %%mm3, %%mm5  \n\t"
00116         "psubw  %%mm3, %%mm0  \n\t"
00117         "paddw  %%mm7, %%mm5  \n\t"
00118         "psubw  %%mm7, %%mm0  \n\t"
00119         "psraw  $1,    %%mm3  \n\t"
00120         "psraw  $1,    %%mm7  \n\t"
00121         "psubw  %%mm3, %%mm5  \n\t"
00122         "psubw  %%mm7, %%mm0  \n\t"
00123 
00124         "movq   %%mm4, %%mm3  \n\t"
00125         "movq   %%mm1, %%mm7  \n\t"
00126         "psraw  $2,    %%mm1  \n\t"
00127         "psraw  $2,    %%mm3  \n\t"
00128         "paddw  %%mm5, %%mm3  \n\t"
00129         "psraw  $2,    %%mm5  \n\t"
00130         "paddw  %%mm0, %%mm1  \n\t"
00131         "psraw  $2,    %%mm0  \n\t"
00132         "psubw  %%mm4, %%mm5  \n\t"
00133         "psubw  %%mm0, %%mm7  \n\t"
00134 
00135         "movq  32(%0), %%mm2  \n\t"
00136         "movq  96(%0), %%mm6  \n\t"
00137         "movq   %%mm2, %%mm4  \n\t"
00138         "movq   %%mm6, %%mm0  \n\t"
00139         "psraw  $1,    %%mm4  \n\t"
00140         "psraw  $1,    %%mm6  \n\t"
00141         "psubw  %%mm0, %%mm4  \n\t"
00142         "paddw  %%mm2, %%mm6  \n\t"
00143 
00144         "movq    (%0), %%mm2  \n\t"
00145         "movq  64(%0), %%mm0  \n\t"
00146         SUMSUB_BA( %%mm0, %%mm2 )
00147         SUMSUB_BA( %%mm6, %%mm0 )
00148         SUMSUB_BA( %%mm4, %%mm2 )
00149         SUMSUB_BA( %%mm7, %%mm6 )
00150         SUMSUB_BA( %%mm5, %%mm4 )
00151         SUMSUB_BA( %%mm3, %%mm2 )
00152         SUMSUB_BA( %%mm1, %%mm0 )
00153         :: "r"(block)
00154     );
00155 }
00156 
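/*
 * Illustrative scalar sketch (not part of the original file) of the 1-D
 * transform that h264_idct8_1d performs on four columns at a time, leaving
 * the eight results in mm0..mm7. The helper name idct8_1d_ref and the
 * unit-stride column layout are assumptions for illustration; the arithmetic
 * is the standard H.264 8x8 inverse-transform column step.
 */
static void idct8_1d_ref(int16_t col[8])     /* one column, gathered to unit stride */
{
    const int a0 =  col[0] + col[4];
    const int a2 =  col[0] - col[4];
    const int a4 = (col[2] >> 1) - col[6];
    const int a6 = (col[6] >> 1) + col[2];

    const int b0 = a0 + a6;
    const int b2 = a2 + a4;
    const int b4 = a2 - a4;
    const int b6 = a0 - a6;

    const int a1 = -col[3] + col[5] - col[7] - (col[7] >> 1);
    const int a3 =  col[1] + col[7] - col[3] - (col[3] >> 1);
    const int a5 = -col[1] + col[7] + col[5] + (col[5] >> 1);
    const int a7 =  col[3] + col[5] + col[1] + (col[1] >> 1);

    const int b1 = (a7 >> 2) + a1;
    const int b3 =  a3 + (a5 >> 2);
    const int b5 = (a3 >> 2) - a5;
    const int b7 =  a7 - (a1 >> 2);

    col[0] = b0 + b7;  col[7] = b0 - b7;
    col[1] = b2 + b5;  col[6] = b2 - b5;
    col[2] = b4 + b3;  col[5] = b4 - b3;
    col[3] = b6 + b1;  col[4] = b6 - b1;
}
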
00157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
00158 {
00159     int i;
00160     DECLARE_ALIGNED(8, int16_t, b2)[64];
00161 
00162     block[0] += 32;
00163 
00164     for(i=0; i<2; i++){
00165         DECLARE_ALIGNED(8, uint64_t, tmp);
00166 
00167         h264_idct8_1d(block+4*i);
00168 
00169         __asm__ volatile(
00170             "movq   %%mm7,    %0   \n\t"
00171             TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
00172             "movq   %%mm0,  8(%1)  \n\t"
00173             "movq   %%mm6, 24(%1)  \n\t"
00174             "movq   %%mm7, 40(%1)  \n\t"
00175             "movq   %%mm4, 56(%1)  \n\t"
00176             "movq    %0,    %%mm7  \n\t"
00177             TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
00178             "movq   %%mm7,   (%1)  \n\t"
00179             "movq   %%mm1, 16(%1)  \n\t"
00180             "movq   %%mm0, 32(%1)  \n\t"
00181             "movq   %%mm3, 48(%1)  \n\t"
00182             : "=m"(tmp)
00183             : "r"(b2+32*i)
00184             : "memory"
00185         );
00186     }
00187 
00188     for(i=0; i<2; i++){
00189         h264_idct8_1d(b2+4*i);
00190 
00191         __asm__ volatile(
00192             "psraw     $6, %%mm7  \n\t"
00193             "psraw     $6, %%mm6  \n\t"
00194             "psraw     $6, %%mm5  \n\t"
00195             "psraw     $6, %%mm4  \n\t"
00196             "psraw     $6, %%mm3  \n\t"
00197             "psraw     $6, %%mm2  \n\t"
00198             "psraw     $6, %%mm1  \n\t"
00199             "psraw     $6, %%mm0  \n\t"
00200 
00201             "movq   %%mm7,    (%0)  \n\t"
00202             "movq   %%mm5,  16(%0)  \n\t"
00203             "movq   %%mm3,  32(%0)  \n\t"
00204             "movq   %%mm1,  48(%0)  \n\t"
00205             "movq   %%mm0,  64(%0)  \n\t"
00206             "movq   %%mm2,  80(%0)  \n\t"
00207             "movq   %%mm4,  96(%0)  \n\t"
00208             "movq   %%mm6, 112(%0)  \n\t"
00209             :: "r"(b2+4*i)
00210             : "memory"
00211         );
00212     }
00213 
00214     add_pixels_clamped_mmx(b2, dst, stride);
00215 }
00216 
00217 #define STORE_DIFF_8P( p, d, t, z )\
00218         "movq       "#d", "#t" \n"\
00219         "psraw       $6,  "#p" \n"\
00220         "punpcklbw  "#z", "#t" \n"\
00221         "paddsw     "#t", "#p" \n"\
00222         "packuswb   "#p", "#p" \n"\
00223         "movq       "#p", "#d" \n"
00224 
00225 #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
00226         "movdqa     "#c", "#a" \n"\
00227         "movdqa     "#g", "#e" \n"\
00228         "psraw       $1,  "#c" \n"\
00229         "psraw       $1,  "#g" \n"\
00230         "psubw      "#e", "#c" \n"\
00231         "paddw      "#a", "#g" \n"\
00232         "movdqa     "#b", "#e" \n"\
00233         "psraw       $1,  "#e" \n"\
00234         "paddw      "#b", "#e" \n"\
00235         "paddw      "#d", "#e" \n"\
00236         "paddw      "#f", "#e" \n"\
00237         "movdqa     "#f", "#a" \n"\
00238         "psraw       $1,  "#a" \n"\
00239         "paddw      "#f", "#a" \n"\
00240         "paddw      "#h", "#a" \n"\
00241         "psubw      "#b", "#a" \n"\
00242         "psubw      "#d", "#b" \n"\
00243         "psubw      "#d", "#f" \n"\
00244         "paddw      "#h", "#b" \n"\
00245         "psubw      "#h", "#f" \n"\
00246         "psraw       $1,  "#d" \n"\
00247         "psraw       $1,  "#h" \n"\
00248         "psubw      "#d", "#b" \n"\
00249         "psubw      "#h", "#f" \n"\
00250         "movdqa     "#e", "#d" \n"\
00251         "movdqa     "#a", "#h" \n"\
00252         "psraw       $2,  "#d" \n"\
00253         "psraw       $2,  "#h" \n"\
00254         "paddw      "#f", "#d" \n"\
00255         "paddw      "#b", "#h" \n"\
00256         "psraw       $2,  "#f" \n"\
00257         "psraw       $2,  "#b" \n"\
00258         "psubw      "#f", "#e" \n"\
00259         "psubw      "#a", "#b" \n"\
00260         "movdqa 0x00(%1), "#a" \n"\
00261         "movdqa 0x40(%1), "#f" \n"\
00262         SUMSUB_BA(f, a)\
00263         SUMSUB_BA(g, f)\
00264         SUMSUB_BA(c, a)\
00265         SUMSUB_BA(e, g)\
00266         SUMSUB_BA(b, c)\
00267         SUMSUB_BA(h, a)\
00268         SUMSUB_BA(d, f)
00269 
00270 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
00271 {
00272     __asm__ volatile(
00273         "movdqa   0x10(%1), %%xmm1 \n"
00274         "movdqa   0x20(%1), %%xmm2 \n"
00275         "movdqa   0x30(%1), %%xmm3 \n"
00276         "movdqa   0x50(%1), %%xmm5 \n"
00277         "movdqa   0x60(%1), %%xmm6 \n"
00278         "movdqa   0x70(%1), %%xmm7 \n"
00279         H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
00280         TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
00281         "paddw          %4, %%xmm4 \n"
00282         "movdqa     %%xmm4, 0x00(%1) \n"
00283         "movdqa     %%xmm2, 0x40(%1) \n"
00284         H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
00285         "movdqa     %%xmm6, 0x60(%1) \n"
00286         "movdqa     %%xmm7, 0x70(%1) \n"
00287         "pxor       %%xmm7, %%xmm7 \n"
00288         STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
00289         STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
00290         STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
00291         STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
00292         "lea     (%0,%2,4), %0 \n"
00293         STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
00294         STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
00295         "movdqa   0x60(%1), %%xmm0 \n"
00296         "movdqa   0x70(%1), %%xmm1 \n"
00297         STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
00298         STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
00299         :"+r"(dst)
00300         :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
00301     );
00302 }
00303 
00304 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00305 {
00306     int dc = (block[0] + 32) >> 6;
00307     __asm__ volatile(
00308         "movd          %0, %%mm0 \n\t"
00309         "pshufw $0, %%mm0, %%mm0 \n\t"
00310         "pxor       %%mm1, %%mm1 \n\t"
00311         "psubw      %%mm0, %%mm1 \n\t"
00312         "packuswb   %%mm0, %%mm0 \n\t"
00313         "packuswb   %%mm1, %%mm1 \n\t"
00314         ::"r"(dc)
00315     );
00316     __asm__ volatile(
00317         "movd          %0, %%mm2 \n\t"
00318         "movd          %1, %%mm3 \n\t"
00319         "movd          %2, %%mm4 \n\t"
00320         "movd          %3, %%mm5 \n\t"
00321         "paddusb    %%mm0, %%mm2 \n\t"
00322         "paddusb    %%mm0, %%mm3 \n\t"
00323         "paddusb    %%mm0, %%mm4 \n\t"
00324         "paddusb    %%mm0, %%mm5 \n\t"
00325         "psubusb    %%mm1, %%mm2 \n\t"
00326         "psubusb    %%mm1, %%mm3 \n\t"
00327         "psubusb    %%mm1, %%mm4 \n\t"
00328         "psubusb    %%mm1, %%mm5 \n\t"
00329         "movd       %%mm2, %0    \n\t"
00330         "movd       %%mm3, %1    \n\t"
00331         "movd       %%mm4, %2    \n\t"
00332         "movd       %%mm5, %3    \n\t"
00333         :"+m"(*(uint32_t*)(dst+0*stride)),
00334          "+m"(*(uint32_t*)(dst+1*stride)),
00335          "+m"(*(uint32_t*)(dst+2*stride)),
00336          "+m"(*(uint32_t*)(dst+3*stride))
00337     );
00338 }
00339 
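/*
 * Illustrative scalar sketch (not part of the original file) of what
 * ff_h264_idct_dc_add_mmx2 above does: when only the DC coefficient is
 * non-zero, the whole inverse transform collapses to adding one rounded
 * constant to every pixel of the 4x4 block. The MMX code splits the signed
 * add into paddusb with max(dc,0) and psubusb with max(-dc,0) so it can work
 * on unsigned bytes without unpacking. The helper name dc_add4x4_ref is made up.
 */
static void dc_add4x4_ref(uint8_t *dst, const int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            int p = dst[y*stride + x] + dc;
            dst[y*stride + x] = p < 0 ? 0 : p > 255 ? 255 : p;
        }
}
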
00340 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00341 {
00342     int dc = (block[0] + 32) >> 6;
00343     int y;
00344     __asm__ volatile(
00345         "movd          %0, %%mm0 \n\t"
00346         "pshufw $0, %%mm0, %%mm0 \n\t"
00347         "pxor       %%mm1, %%mm1 \n\t"
00348         "psubw      %%mm0, %%mm1 \n\t"
00349         "packuswb   %%mm0, %%mm0 \n\t"
00350         "packuswb   %%mm1, %%mm1 \n\t"
00351         ::"r"(dc)
00352     );
00353     for(y=2; y--; dst += 4*stride){
00354     __asm__ volatile(
00355         "movq          %0, %%mm2 \n\t"
00356         "movq          %1, %%mm3 \n\t"
00357         "movq          %2, %%mm4 \n\t"
00358         "movq          %3, %%mm5 \n\t"
00359         "paddusb    %%mm0, %%mm2 \n\t"
00360         "paddusb    %%mm0, %%mm3 \n\t"
00361         "paddusb    %%mm0, %%mm4 \n\t"
00362         "paddusb    %%mm0, %%mm5 \n\t"
00363         "psubusb    %%mm1, %%mm2 \n\t"
00364         "psubusb    %%mm1, %%mm3 \n\t"
00365         "psubusb    %%mm1, %%mm4 \n\t"
00366         "psubusb    %%mm1, %%mm5 \n\t"
00367         "movq       %%mm2, %0    \n\t"
00368         "movq       %%mm3, %1    \n\t"
00369         "movq       %%mm4, %2    \n\t"
00370         "movq       %%mm5, %3    \n\t"
00371         :"+m"(*(uint64_t*)(dst+0*stride)),
00372          "+m"(*(uint64_t*)(dst+1*stride)),
00373          "+m"(*(uint64_t*)(dst+2*stride)),
00374          "+m"(*(uint64_t*)(dst+3*stride))
00375     );
00376     }
00377 }
00378 
00390 //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
00380 static const uint8_t scan8[16 + 2*4]={
00381  4+1*8, 5+1*8, 4+2*8, 5+2*8,
00382  6+1*8, 7+1*8, 6+2*8, 7+2*8,
00383  4+3*8, 5+3*8, 4+4*8, 5+4*8,
00384  6+3*8, 7+3*8, 6+4*8, 7+4*8,
00385  1+1*8, 2+1*8,
00386  1+2*8, 2+2*8,
00387  1+4*8, 2+4*8,
00388  1+5*8, 2+5*8,
00389 };
00390 
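/*
 * How this table is used (illustrative note, not from the original source):
 * the nnzc[] argument of the functions below is an 8-column cache of
 * per-4x4-block non-zero coefficient counts with a one-entry border, so
 * scan8[i] = col + row*8. For example, scan8[0] = 4+1*8 = 12, i.e. luma
 * block 0 sits at column 4, row 1 of that cache, and nnzc[ scan8[0] ] is its
 * coded-coefficient count.
 */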
00391 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00392     int i;
00393     for(i=0; i<16; i++){
00394         if(nnzc[ scan8[i] ])
00395             ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00396     }
00397 }
00398 
00399 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00400     int i;
00401     for(i=0; i<16; i+=4){
00402         if(nnzc[ scan8[i] ])
00403             ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
00404     }
00405 }
00406 
00407 
00408 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00409     int i;
00410     for(i=0; i<16; i++){
00411         int nnz = nnzc[ scan8[i] ];
00412         if(nnz){
00413             if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00414             else                      ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
00415         }
00416     }
00417 }
00418 
00419 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00420     int i;
00421     for(i=0; i<16; i++){
00422         if(nnzc[ scan8[i] ] || block[i*16])
00423             ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00424     }
00425 }
00426 
00427 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00428     int i;
00429     for(i=0; i<16; i++){
00430         if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
00431         else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00432     }
00433 }
00434 
00435 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00436     int i;
00437     for(i=0; i<16; i+=4){
00438         int nnz = nnzc[ scan8[i] ];
00439         if(nnz){
00440             if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00441             else                      ff_h264_idct8_add_mmx    (dst + block_offset[i], block + i*16, stride);
00442         }
00443     }
00444 }
00445 
00446 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00447     int i;
00448     for(i=0; i<16; i+=4){
00449         int nnz = nnzc[ scan8[i] ];
00450         if(nnz){
00451             if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00452             else                      ff_h264_idct8_add_sse2   (dst + block_offset[i], block + i*16, stride);
00453         }
00454     }
00455 }
00456 
00457 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00458     int i;
00459     for(i=16; i<16+8; i++){
00460         if(nnzc[ scan8[i] ] || block[i*16])
00461             ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00462     }
00463 }
00464 
00465 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00466     int i;
00467     for(i=16; i<16+8; i++){
00468         if(nnzc[ scan8[i] ])
00469             ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00470         else if(block[i*16])
00471             ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00472     }
00473 }
00474 
00475 #if CONFIG_GPL && HAVE_YASM
00476 static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
00477 {
00478     __asm__ volatile(
00479         "movd             %0, %%mm0 \n\t"   //  0 0 X D
00480         "punpcklwd        %1, %%mm0 \n\t"   //  x X d D
00481         "paddsw           %2, %%mm0 \n\t"
00482         "psraw            $6, %%mm0 \n\t"
00483         "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d D D
00484         "pxor          %%mm1, %%mm1 \n\t"   //  0 0 0 0
00485         "psubw         %%mm0, %%mm1 \n\t"   // -d-d-D-D
00486         "packuswb      %%mm1, %%mm0 \n\t"   // -d-d-D-D d d D D
00487         "pshufw $0xFA, %%mm0, %%mm1 \n\t"   // -d-d-d-d-D-D-D-D
00488         "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d d d D D D D
00489         ::"m"(block[ 0]),
00490           "m"(block[16]),
00491           "m"(ff_pw_32)
00492     );
00493     __asm__ volatile(
00494         "movq          %0, %%mm2 \n\t"
00495         "movq          %1, %%mm3 \n\t"
00496         "movq          %2, %%mm4 \n\t"
00497         "movq          %3, %%mm5 \n\t"
00498         "paddusb    %%mm0, %%mm2 \n\t"
00499         "paddusb    %%mm0, %%mm3 \n\t"
00500         "paddusb    %%mm0, %%mm4 \n\t"
00501         "paddusb    %%mm0, %%mm5 \n\t"
00502         "psubusb    %%mm1, %%mm2 \n\t"
00503         "psubusb    %%mm1, %%mm3 \n\t"
00504         "psubusb    %%mm1, %%mm4 \n\t"
00505         "psubusb    %%mm1, %%mm5 \n\t"
00506         "movq       %%mm2, %0    \n\t"
00507         "movq       %%mm3, %1    \n\t"
00508         "movq       %%mm4, %2    \n\t"
00509         "movq       %%mm5, %3    \n\t"
00510         :"+m"(*(uint64_t*)(dst+0*stride)),
00511          "+m"(*(uint64_t*)(dst+1*stride)),
00512          "+m"(*(uint64_t*)(dst+2*stride)),
00513          "+m"(*(uint64_t*)(dst+3*stride))
00514     );
00515 }
00516 
00517 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
00518 
00519 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00520     int i;
00521     for(i=0; i<16; i+=2)
00522         if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00523             ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00524 }
00525 
00526 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00527     int i;
00528     for(i=0; i<16; i+=2){
00529         if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00530             ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00531         else if(block[i*16]|block[i*16+16])
00532             ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
00533     }
00534 }
00535 
00536 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00537     int i;
00538     for(i=16; i<16+8; i+=2){
00539         if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00540             ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00541         else if(block[i*16]|block[i*16+16])
00542             ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00543     }
00544 }
00545 #endif
00546 
00547 /***********************************/
00548 /* deblocking */
00549 
00550 // out: o = |x-y|>a
00551 // clobbers: t
00552 #define DIFF_GT_MMX(x,y,a,o,t)\
00553     "movq     "#y", "#t"  \n\t"\
00554     "movq     "#x", "#o"  \n\t"\
00555     "psubusb  "#x", "#t"  \n\t"\
00556     "psubusb  "#y", "#o"  \n\t"\
00557     "por      "#t", "#o"  \n\t"\
00558     "psubusb  "#a", "#o"  \n\t"
00559 
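// Per-byte meaning of DIFF_GT_MMX, as an illustrative scalar sketch (not part
// of the original file): with unsigned saturation, max(x-y,0) OR max(y-x,0)
// is |x-y|, so after the final psubusb the output byte is
//
//     o = max(|x - y| - a, 0);   // non-zero exactly when |x - y| > a
//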
00560 // out: o = |x-y|>a
00561 // clobbers: t
00562 #define DIFF_GT2_MMX(x,y,a,o,t)\
00563     "movq     "#y", "#t"  \n\t"\
00564     "movq     "#x", "#o"  \n\t"\
00565     "psubusb  "#x", "#t"  \n\t"\
00566     "psubusb  "#y", "#o"  \n\t"\
00567     "psubusb  "#a", "#t"  \n\t"\
00568     "psubusb  "#a", "#o"  \n\t"\
00569     "pcmpeqb  "#t", "#o"  \n\t"\
00570 
00571 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
00572 // out: mm5=beta-1, mm7=mask
00573 // clobbers: mm4,mm6
00574 #define H264_DEBLOCK_MASK(alpha1, beta1) \
00575     "pshufw $0, "#alpha1", %%mm4 \n\t"\
00576     "pshufw $0, "#beta1 ", %%mm5 \n\t"\
00577     "packuswb  %%mm4, %%mm4      \n\t"\
00578     "packuswb  %%mm5, %%mm5      \n\t"\
00579     DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
00580     DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
00581     "por       %%mm4, %%mm7      \n\t"\
00582     DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
00583     "por       %%mm4, %%mm7      \n\t"\
00584     "pxor      %%mm6, %%mm6      \n\t"\
00585     "pcmpeqb   %%mm6, %%mm7      \n\t"
00586 
00587 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
00588 // out: mm1=p0' mm2=q0'
00589 // clobbers: mm0,3-6
00590 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
00591         "movq    %%mm1              , %%mm5 \n\t"\
00592         "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
00593         "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
00594         "pcmpeqb %%mm4              , %%mm4 \n\t"\
00595         "pxor    %%mm4              , %%mm3 \n\t"\
00596         "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
00597         "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
00598         "pxor    %%mm1              , %%mm4 \n\t"\
00599         "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
00600         "pavgb   %%mm5              , %%mm3 \n\t"\
00601         "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
00602         "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
00603         "psubusb %%mm3              , %%mm6 \n\t"\
00604         "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
00605         "pminub  %%mm7              , %%mm6 \n\t"\
00606         "pminub  %%mm7              , %%mm3 \n\t"\
00607         "psubusb %%mm6              , %%mm1 \n\t"\
00608         "psubusb %%mm3              , %%mm2 \n\t"\
00609         "paddusb %%mm3              , %%mm1 \n\t"\
00610         "paddusb %%mm6              , %%mm2 \n\t"
00611 
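/*
 * Illustrative scalar equivalent (not part of the original file) of the
 * normal-strength p0/q0 update that H264_DEBLOCK_P0_Q0 computes with pavgb
 * tricks, following the standard H.264 formula:
 *
 *     delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
 *     p0'   = av_clip_uint8(p0 + delta);
 *     q0'   = av_clip_uint8(q0 - delta);
 *
 * where tc is the (tc & mask) value held in mm7 on entry.
 */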
00612 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
00613 // out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
00614 // clobbers: q2, tmp, tc0
00615 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
00616         "movq     %%mm1,  "#tmp"   \n\t"\
00617         "pavgb    %%mm2,  "#tmp"   \n\t"\
00618         "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
00619         "pxor   "q2addr", "#tmp"   \n\t"\
00620         "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
00621         "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
00622         "movq     "#p1",  "#tmp"   \n\t"\
00623         "psubusb  "#tc0", "#tmp"   \n\t"\
00624         "paddusb  "#p1",  "#tc0"   \n\t"\
00625         "pmaxub   "#tmp", "#q2"    \n\t"\
00626         "pminub   "#tc0", "#q2"    \n\t"\
00627         "movq     "#q2",  "q1addr" \n\t"
00628 
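/*
 * Illustrative scalar equivalent (not part of the original file) of
 * H264_DEBLOCK_Q1, matching the av_clip() form given in the comment above:
 *
 *     p1' = p1 + av_clip( ((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0 );
 *
 * (and symmetrically for q1 with q2). The pavgb/pxor/pand sequence computes
 * the inner (p2 + ((p0+q0+1)>>1)) >> 1 average without overflow, and the
 * pmaxub/pminub pair performs the clip to [p1-tc0, p1+tc0].
 */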
00629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00630 {
00631     DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
00632 
00633     __asm__ volatile(
00634         "movq    (%2,%4), %%mm0    \n\t" //p1
00635         "movq    (%2,%4,2), %%mm1  \n\t" //p0
00636         "movq    (%3),    %%mm2    \n\t" //q0
00637         "movq    (%3,%4), %%mm3    \n\t" //q1
00638         H264_DEBLOCK_MASK(%7, %8)
00639 
00640         "movd      %6,    %%mm4    \n\t"
00641         "punpcklbw %%mm4, %%mm4    \n\t"
00642         "punpcklwd %%mm4, %%mm4    \n\t"
00643         "pcmpeqb   %%mm3, %%mm3    \n\t"
00644         "movq      %%mm4, %%mm6    \n\t"
00645         "pcmpgtb   %%mm3, %%mm4    \n\t"
00646         "movq      %%mm6, %1       \n\t"
00647         "pand      %%mm4, %%mm7    \n\t"
00648         "movq      %%mm7, %0       \n\t"
00649 
00650         /* filter p1 */
00651         "movq     (%2),   %%mm3    \n\t" //p2
00652         DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
00653         "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
00654         "pand     %1,     %%mm7    \n\t" // mask & tc0
00655         "movq     %%mm7,  %%mm4    \n\t"
00656         "psubb    %%mm6,  %%mm7    \n\t"
00657         "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
00658         H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
00659 
00660         /* filter q1 */
00661         "movq    (%3,%4,2), %%mm4  \n\t" //q2
00662         DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
00663         "pand     %0,     %%mm6    \n\t"
00664         "movq     %1,     %%mm5    \n\t" // could be merged with the pand below, but that is slower
00665         "pand     %%mm6,  %%mm5    \n\t"
00666         "psubb    %%mm6,  %%mm7    \n\t"
00667         "movq    (%3,%4), %%mm3    \n\t"
00668         H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
00669 
00670         /* filter p0, q0 */
00671         H264_DEBLOCK_P0_Q0(%9, unused)
00672         "movq      %%mm1, (%2,%4,2) \n\t"
00673         "movq      %%mm2, (%3)      \n\t"
00674 
00675         : "=m"(tmp0[0]), "=m"(tmp0[1])
00676         : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
00677           "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
00678           "m"(ff_bone)
00679     );
00680 }
00681 
00682 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00683 {
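    /* Illustrative note (not from the original source): tc0[] entries are
     * negative (-1) for edge segments that must not be filtered; ANDing two
     * int8 values has its sign bit set only when both are negative, so each
     * 8-pixel half of the edge is skipped only when both of its tc0 values
     * are negative. */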
00684     if((tc0[0] & tc0[1]) >= 0)
00685         h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00686     if((tc0[2] & tc0[3]) >= 0)
00687         h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
00688 }
00689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00690 {
00691     //FIXME: could cut some load/stores by merging transpose with filter
00692     // also, it only needs to transpose 6x8
00693     DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
00694     int i;
00695     for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
00696         if((tc0[0] & tc0[1]) < 0)
00697             continue;
00698         transpose4x4(trans,       pix-4,          8, stride);
00699         transpose4x4(trans  +4*8, pix,            8, stride);
00700         transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
00701         transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
00702         h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
00703         transpose4x4(pix-2,          trans  +2*8, stride, 8);
00704         transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
00705     }
00706 }
00707 
00708 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00709 {
00710     __asm__ volatile(
00711         "movq    (%0),    %%mm0     \n\t" //p1
00712         "movq    (%0,%2), %%mm1     \n\t" //p0
00713         "movq    (%1),    %%mm2     \n\t" //q0
00714         "movq    (%1,%2), %%mm3     \n\t" //q1
00715         H264_DEBLOCK_MASK(%4, %5)
00716         "movd      %3,    %%mm6     \n\t"
00717         "punpcklbw %%mm6, %%mm6     \n\t"
00718         "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
00719         H264_DEBLOCK_P0_Q0(%6, %7)
00720         "movq      %%mm1, (%0,%2)   \n\t"
00721         "movq      %%mm2, (%1)      \n\t"
00722 
00723         :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
00724            "r"(*(uint32_t*)tc0),
00725            "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
00726     );
00727 }
00728 
00729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00730 {
00731     h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00732 }
00733 
00734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00735 {
00736     //FIXME: could cut some load/stores by merging transpose with filter
00737     DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
00738     transpose4x4(trans, pix-2, 8, stride);
00739     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00740     h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
00741     transpose4x4(pix-2, trans, stride, 8);
00742     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00743 }
00744 
00745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2
00746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
00747     "movq    "#p0", %%mm4  \n\t"\
00748     "pxor    "#q1", %%mm4  \n\t"\
00749     "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
00750     "pavgb   "#q1", "#p0"  \n\t"\
00751     "psubusb %%mm4, "#p0"  \n\t"\
00752     "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
00753 
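/*
 * Illustrative note (not from the original source) on why the pavgb form in
 * H264_FILTER_CHROMA4 matches the formula above:
 *
 *     pavgb(p0, q1) - ((p0 ^ q1) & 1)  =  (p0 + q1) >> 1                 (floor)
 *     pavgb(p1, (p0 + q1) >> 1)        =  (p1 + ((p0 + q1) >> 1) + 1) >> 1
 *                                      =  (p0 + q1 + 2*p1 + 2) >> 2
 */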
00754 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
00755 {
00756     __asm__ volatile(
00757         "movq    (%0),    %%mm0     \n\t"
00758         "movq    (%0,%2), %%mm1     \n\t"
00759         "movq    (%1),    %%mm2     \n\t"
00760         "movq    (%1,%2), %%mm3     \n\t"
00761         H264_DEBLOCK_MASK(%3, %4)
00762         "movq    %%mm1,   %%mm5     \n\t"
00763         "movq    %%mm2,   %%mm6     \n\t"
00764         H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
00765         H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
00766         "psubb   %%mm5,   %%mm1     \n\t"
00767         "psubb   %%mm6,   %%mm2     \n\t"
00768         "pand    %%mm7,   %%mm1     \n\t"
00769         "pand    %%mm7,   %%mm2     \n\t"
00770         "paddb   %%mm5,   %%mm1     \n\t"
00771         "paddb   %%mm6,   %%mm2     \n\t"
00772         "movq    %%mm1,   (%0,%2)   \n\t"
00773         "movq    %%mm2,   (%1)      \n\t"
00774         :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
00775            "m"(alpha1), "m"(beta1), "m"(ff_bone)
00776     );
00777 }
00778 
00779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00780 {
00781     h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
00782 }
00783 
00784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00785 {
00786     //FIXME: could cut some load/stores by merging transpose with filter
00787     DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
00788     transpose4x4(trans, pix-2, 8, stride);
00789     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00790     h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
00791     transpose4x4(pix-2, trans, stride, 8);
00792     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00793 }
00794 
00795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
00796                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
00797     int dir;
00798     __asm__ volatile(
00799         "movq %0, %%mm7 \n"
00800         "movq %1, %%mm6 \n"
00801         ::"m"(ff_pb_1), "m"(ff_pb_3)
00802     );
00803     if(field)
00804         __asm__ volatile(
00805             "movq %0, %%mm6 \n"
00806             ::"m"(ff_pb_3_1)
00807         );
00808     __asm__ volatile(
00809         "movq  %%mm6, %%mm5 \n"
00810         "paddb %%mm5, %%mm5 \n"
00811     :);
00812 
00813     // could do a special case for dir==0 && edges==1, but it only reduces the
00814     // average filter time by 1.2%
00815     for( dir=1; dir>=0; dir-- ) {
00816         const x86_reg d_idx = dir ? -8 : -1;
00817         const int mask_mv = dir ? mask_mv1 : mask_mv0;
00818         DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
00819         int b_idx, edge;
00820         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
00821             __asm__ volatile(
00822                 "pand %0, %%mm0 \n\t"
00823                 ::"m"(mask_dir)
00824             );
00825             if(!(mask_mv & edge)) {
00826                 if(bidir) {
00827                     __asm__ volatile(
00828                         "movd         (%1,%0), %%mm2 \n"
00829                         "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
00830                         "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
00831                         "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
00832                         "pshufw $0x4E, %%mm2, %%mm3 \n"
00833                         "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
00834                         "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
00835                         "1: \n"
00836                         "por           %%mm1, %%mm0 \n"
00837                         "movq      (%2,%0,4), %%mm1 \n"
00838                         "movq     8(%2,%0,4), %%mm2 \n"
00839                         "movq          %%mm1, %%mm3 \n"
00840                         "movq          %%mm2, %%mm4 \n"
00841                         "psubw          (%2), %%mm1 \n"
00842                         "psubw         8(%2), %%mm2 \n"
00843                         "psubw       160(%2), %%mm3 \n"
00844                         "psubw       168(%2), %%mm4 \n"
00845                         "packsswb      %%mm2, %%mm1 \n"
00846                         "packsswb      %%mm4, %%mm3 \n"
00847                         "paddb         %%mm6, %%mm1 \n"
00848                         "paddb         %%mm6, %%mm3 \n"
00849                         "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
00850                         "psubusb       %%mm5, %%mm3 \n"
00851                         "packsswb      %%mm3, %%mm1 \n"
00852                         "add $40, %0 \n"
00853                         "cmp $40, %0 \n"
00854                         "jl 1b \n"
00855                         "sub $80, %0 \n"
00856                         "pshufw $0x4E, %%mm1, %%mm1 \n"
00857                         "por           %%mm1, %%mm0 \n"
00858                         "pshufw $0x4E, %%mm0, %%mm1 \n"
00859                         "pminub        %%mm1, %%mm0 \n"
00860                         ::"r"(d_idx),
00861                           "r"(ref[0]+b_idx),
00862                           "r"(mv[0]+b_idx)
00863                     );
00864                 } else {
00865                     __asm__ volatile(
00866                         "movd        (%1), %%mm0 \n"
00867                         "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
00868                         "movq        (%2), %%mm1 \n"
00869                         "movq       8(%2), %%mm2 \n"
00870                         "psubw  (%2,%0,4), %%mm1 \n"
00871                         "psubw 8(%2,%0,4), %%mm2 \n"
00872                         "packsswb   %%mm2, %%mm1 \n"
00873                         "paddb      %%mm6, %%mm1 \n"
00874                         "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
00875                         "packsswb   %%mm1, %%mm1 \n"
00876                         "por        %%mm1, %%mm0 \n"
00877                         ::"r"(d_idx),
00878                           "r"(ref[0]+b_idx),
00879                           "r"(mv[0]+b_idx)
00880                     );
00881                 }
00882             }
00883             __asm__ volatile(
00884                 "movd %0, %%mm1 \n"
00885                 "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
00886                 ::"m"(nnz[b_idx]),
00887                   "m"(nnz[b_idx+d_idx])
00888             );
00889             __asm__ volatile(
00890                 "pminub    %%mm7, %%mm1 \n"
00891                 "pminub    %%mm7, %%mm0 \n"
00892                 "psllw        $1, %%mm1 \n"
00893                 "pxor      %%mm2, %%mm2 \n"
00894                 "pmaxub    %%mm0, %%mm1 \n"
00895                 "punpcklbw %%mm2, %%mm1 \n"
00896                 "movq      %%mm1, %0    \n"
00897                 :"=m"(*bS[dir][edge])
00898                 ::"memory"
00899             );
00900         }
00901         edges = 4;
00902         step = 1;
00903     }
00904     __asm__ volatile(
00905         "movq   (%0), %%mm0 \n\t"
00906         "movq  8(%0), %%mm1 \n\t"
00907         "movq 16(%0), %%mm2 \n\t"
00908         "movq 24(%0), %%mm3 \n\t"
00909         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
00910         "movq %%mm0,   (%0) \n\t"
00911         "movq %%mm3,  8(%0) \n\t"
00912         "movq %%mm4, 16(%0) \n\t"
00913         "movq %%mm2, 24(%0) \n\t"
00914         ::"r"(bS[0])
00915         :"memory"
00916     );
00917 }
00918 
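/*
 * Illustrative scalar sketch (not part of the original file) of the per-edge
 * rule that h264_loop_filter_strength_mmx2 evaluates eight 4x4 edges at a
 * time: bS is 2 when either neighbouring block has coded coefficients,
 * otherwise 1 when the references differ or a motion-vector component
 * differs by the limit or more (4 in quarter-pel units, with the
 * field-adjusted variant loaded from ff_pb_3_1), otherwise 0.
 *
 *     if      (nnz[b] | nnz[bn])                    bS = 2;
 *     else if (ref_differs || mv_diff_at_limit)     bS = 1;
 *     else                                          bS = 0;
 */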
00919 /***********************************/
00920 /* motion compensation */
00921 
00922 #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
00923         "mov"#q" "#C", "#T"         \n\t"\
00924         "mov"#d" (%0), "#F"         \n\t"\
00925         "paddw "#D", "#T"           \n\t"\
00926         "psllw $2, "#T"             \n\t"\
00927         "psubw "#B", "#T"           \n\t"\
00928         "psubw "#E", "#T"           \n\t"\
00929         "punpcklbw "#Z", "#F"       \n\t"\
00930         "pmullw %4, "#T"            \n\t"\
00931         "paddw %5, "#A"             \n\t"\
00932         "add %2, %0                 \n\t"\
00933         "paddw "#F", "#A"           \n\t"\
00934         "paddw "#A", "#T"           \n\t"\
00935         "psraw $5, "#T"             \n\t"\
00936         "packuswb "#T", "#T"        \n\t"\
00937         OP(T, (%1), A, d)\
00938         "add %3, %1                 \n\t"
00939 
00940 #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
00941         "mov"#q" "#C", "#T"         \n\t"\
00942         "mov"#d" (%0), "#F"         \n\t"\
00943         "paddw "#D", "#T"           \n\t"\
00944         "psllw $2, "#T"             \n\t"\
00945         "paddw %4, "#A"             \n\t"\
00946         "psubw "#B", "#T"           \n\t"\
00947         "psubw "#E", "#T"           \n\t"\
00948         "punpcklbw "#Z", "#F"       \n\t"\
00949         "pmullw %3, "#T"            \n\t"\
00950         "paddw "#F", "#A"           \n\t"\
00951         "add %2, %0                 \n\t"\
00952         "paddw "#A", "#T"           \n\t"\
00953         "mov"#q" "#T", "#OF"(%1)    \n\t"
00954 
00955 #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
00956 #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
00957 #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
00958 #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
00959 
00960 
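/*
 * Illustrative note (not from the original source): these macros evaluate the
 * H.264 six-tap luma half-pel filter with taps (1,-5,20,20,-5,1) over the
 * sample window A..F (C and D are the two centre taps). QPEL_H264V_MM yields
 * the finished pixel,
 *
 *     out = av_clip_uint8( (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5 );
 *
 * while QPEL_H264HV_MM stores the same weighted sum unshifted (plus the bias
 * taken from operand %4) into the tmp buffer, so the second pass can filter
 * it again and apply the combined scaling.
 */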
00961 #define QPEL_H264(OPNAME, OP, MMX)\
00962 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00963     int h=4;\
00964 \
00965     __asm__ volatile(\
00966         "pxor %%mm7, %%mm7          \n\t"\
00967         "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
00968         "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
00969         "1:                         \n\t"\
00970         "movd  -1(%0), %%mm1        \n\t"\
00971         "movd    (%0), %%mm2        \n\t"\
00972         "movd   1(%0), %%mm3        \n\t"\
00973         "movd   2(%0), %%mm0        \n\t"\
00974         "punpcklbw %%mm7, %%mm1     \n\t"\
00975         "punpcklbw %%mm7, %%mm2     \n\t"\
00976         "punpcklbw %%mm7, %%mm3     \n\t"\
00977         "punpcklbw %%mm7, %%mm0     \n\t"\
00978         "paddw %%mm0, %%mm1         \n\t"\
00979         "paddw %%mm3, %%mm2         \n\t"\
00980         "movd  -2(%0), %%mm0        \n\t"\
00981         "movd   3(%0), %%mm3        \n\t"\
00982         "punpcklbw %%mm7, %%mm0     \n\t"\
00983         "punpcklbw %%mm7, %%mm3     \n\t"\
00984         "paddw %%mm3, %%mm0         \n\t"\
00985         "psllw $2, %%mm2            \n\t"\
00986         "psubw %%mm1, %%mm2         \n\t"\
00987         "pmullw %%mm4, %%mm2        \n\t"\
00988         "paddw %%mm5, %%mm0         \n\t"\
00989         "paddw %%mm2, %%mm0         \n\t"\
00990         "psraw $5, %%mm0            \n\t"\
00991         "packuswb %%mm0, %%mm0      \n\t"\
00992         OP(%%mm0, (%1),%%mm6, d)\
00993         "add %3, %0                 \n\t"\
00994         "add %4, %1                 \n\t"\
00995         "decl %2                    \n\t"\
00996         " jnz 1b                    \n\t"\
00997         : "+a"(src), "+c"(dst), "+g"(h)\
00998         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
00999         : "memory"\
01000     );\
01001 }\
01002 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01003     int h=4;\
01004     __asm__ volatile(\
01005         "pxor %%mm7, %%mm7          \n\t"\
01006         "movq %0, %%mm4             \n\t"\
01007         "movq %1, %%mm5             \n\t"\
01008         :: "m"(ff_pw_5), "m"(ff_pw_16)\
01009     );\
01010     do{\
01011     __asm__ volatile(\
01012         "movd  -1(%0), %%mm1        \n\t"\
01013         "movd    (%0), %%mm2        \n\t"\
01014         "movd   1(%0), %%mm3        \n\t"\
01015         "movd   2(%0), %%mm0        \n\t"\
01016         "punpcklbw %%mm7, %%mm1     \n\t"\
01017         "punpcklbw %%mm7, %%mm2     \n\t"\
01018         "punpcklbw %%mm7, %%mm3     \n\t"\
01019         "punpcklbw %%mm7, %%mm0     \n\t"\
01020         "paddw %%mm0, %%mm1         \n\t"\
01021         "paddw %%mm3, %%mm2         \n\t"\
01022         "movd  -2(%0), %%mm0        \n\t"\
01023         "movd   3(%0), %%mm3        \n\t"\
01024         "punpcklbw %%mm7, %%mm0     \n\t"\
01025         "punpcklbw %%mm7, %%mm3     \n\t"\
01026         "paddw %%mm3, %%mm0         \n\t"\
01027         "psllw $2, %%mm2            \n\t"\
01028         "psubw %%mm1, %%mm2         \n\t"\
01029         "pmullw %%mm4, %%mm2        \n\t"\
01030         "paddw %%mm5, %%mm0         \n\t"\
01031         "paddw %%mm2, %%mm0         \n\t"\
01032         "movd   (%2), %%mm3         \n\t"\
01033         "psraw $5, %%mm0            \n\t"\
01034         "packuswb %%mm0, %%mm0      \n\t"\
01035         PAVGB" %%mm3, %%mm0         \n\t"\
01036         OP(%%mm0, (%1),%%mm6, d)\
01037         "add %4, %0                 \n\t"\
01038         "add %4, %1                 \n\t"\
01039         "add %3, %2                 \n\t"\
01040         : "+a"(src), "+c"(dst), "+d"(src2)\
01041         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
01042         : "memory"\
01043     );\
01044     }while(--h);\
01045 }\
01046 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01047     src -= 2*srcStride;\
01048     __asm__ volatile(\
01049         "pxor %%mm7, %%mm7          \n\t"\
01050         "movd (%0), %%mm0           \n\t"\
01051         "add %2, %0                 \n\t"\
01052         "movd (%0), %%mm1           \n\t"\
01053         "add %2, %0                 \n\t"\
01054         "movd (%0), %%mm2           \n\t"\
01055         "add %2, %0                 \n\t"\
01056         "movd (%0), %%mm3           \n\t"\
01057         "add %2, %0                 \n\t"\
01058         "movd (%0), %%mm4           \n\t"\
01059         "add %2, %0                 \n\t"\
01060         "punpcklbw %%mm7, %%mm0     \n\t"\
01061         "punpcklbw %%mm7, %%mm1     \n\t"\
01062         "punpcklbw %%mm7, %%mm2     \n\t"\
01063         "punpcklbw %%mm7, %%mm3     \n\t"\
01064         "punpcklbw %%mm7, %%mm4     \n\t"\
01065         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01066         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01067         QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01068         QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01069          \
01070         : "+a"(src), "+c"(dst)\
01071         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01072         : "memory"\
01073     );\
01074 }\
01075 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01076     int h=4;\
01077     int w=3;\
01078     src -= 2*srcStride+2;\
01079     while(w--){\
01080         __asm__ volatile(\
01081             "pxor %%mm7, %%mm7      \n\t"\
01082             "movd (%0), %%mm0       \n\t"\
01083             "add %2, %0             \n\t"\
01084             "movd (%0), %%mm1       \n\t"\
01085             "add %2, %0             \n\t"\
01086             "movd (%0), %%mm2       \n\t"\
01087             "add %2, %0             \n\t"\
01088             "movd (%0), %%mm3       \n\t"\
01089             "add %2, %0             \n\t"\
01090             "movd (%0), %%mm4       \n\t"\
01091             "add %2, %0             \n\t"\
01092             "punpcklbw %%mm7, %%mm0 \n\t"\
01093             "punpcklbw %%mm7, %%mm1 \n\t"\
01094             "punpcklbw %%mm7, %%mm2 \n\t"\
01095             "punpcklbw %%mm7, %%mm3 \n\t"\
01096             "punpcklbw %%mm7, %%mm4 \n\t"\
01097             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
01098             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
01099             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
01100             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
01101              \
01102             : "+a"(src)\
01103             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01104             : "memory"\
01105         );\
01106         tmp += 4;\
01107         src += 4 - 9*srcStride;\
01108     }\
01109     tmp -= 3*4;\
01110     __asm__ volatile(\
01111         "1:                         \n\t"\
01112         "movq     (%0), %%mm0       \n\t"\
01113         "paddw  10(%0), %%mm0       \n\t"\
01114         "movq    2(%0), %%mm1       \n\t"\
01115         "paddw   8(%0), %%mm1       \n\t"\
01116         "movq    4(%0), %%mm2       \n\t"\
01117         "paddw   6(%0), %%mm2       \n\t"\
01118         "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
01119         "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
01120         "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
01121         "paddsw %%mm2, %%mm0        \n\t"\
01122         "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
01123         "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
01124         "psraw $6, %%mm0            \n\t"\
01125         "packuswb %%mm0, %%mm0      \n\t"\
01126         OP(%%mm0, (%1),%%mm7, d)\
01127         "add $24, %0                \n\t"\
01128         "add %3, %1                 \n\t"\
01129         "decl %2                    \n\t"\
01130         " jnz 1b                    \n\t"\
01131         : "+a"(tmp), "+c"(dst), "+g"(h)\
01132         : "S"((x86_reg)dstStride)\
01133         : "memory"\
01134     );\
01135 }\
01136 \
01137 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01138     int h=8;\
01139     __asm__ volatile(\
01140         "pxor %%mm7, %%mm7          \n\t"\
01141         "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
01142         "1:                         \n\t"\
01143         "movq    (%0), %%mm0        \n\t"\
01144         "movq   1(%0), %%mm2        \n\t"\
01145         "movq %%mm0, %%mm1          \n\t"\
01146         "movq %%mm2, %%mm3          \n\t"\
01147         "punpcklbw %%mm7, %%mm0     \n\t"\
01148         "punpckhbw %%mm7, %%mm1     \n\t"\
01149         "punpcklbw %%mm7, %%mm2     \n\t"\
01150         "punpckhbw %%mm7, %%mm3     \n\t"\
01151         "paddw %%mm2, %%mm0         \n\t"\
01152         "paddw %%mm3, %%mm1         \n\t"\
01153         "psllw $2, %%mm0            \n\t"\
01154         "psllw $2, %%mm1            \n\t"\
01155         "movq   -1(%0), %%mm2       \n\t"\
01156         "movq    2(%0), %%mm4       \n\t"\
01157         "movq %%mm2, %%mm3          \n\t"\
01158         "movq %%mm4, %%mm5          \n\t"\
01159         "punpcklbw %%mm7, %%mm2     \n\t"\
01160         "punpckhbw %%mm7, %%mm3     \n\t"\
01161         "punpcklbw %%mm7, %%mm4     \n\t"\
01162         "punpckhbw %%mm7, %%mm5     \n\t"\
01163         "paddw %%mm4, %%mm2         \n\t"\
01164         "paddw %%mm3, %%mm5         \n\t"\
01165         "psubw %%mm2, %%mm0         \n\t"\
01166         "psubw %%mm5, %%mm1         \n\t"\
01167         "pmullw %%mm6, %%mm0        \n\t"\
01168         "pmullw %%mm6, %%mm1        \n\t"\
01169         "movd   -2(%0), %%mm2       \n\t"\
01170         "movd    7(%0), %%mm5       \n\t"\
01171         "punpcklbw %%mm7, %%mm2     \n\t"\
01172         "punpcklbw %%mm7, %%mm5     \n\t"\
01173         "paddw %%mm3, %%mm2         \n\t"\
01174         "paddw %%mm5, %%mm4         \n\t"\
01175         "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
01176         "paddw %%mm5, %%mm2         \n\t"\
01177         "paddw %%mm5, %%mm4         \n\t"\
01178         "paddw %%mm2, %%mm0         \n\t"\
01179         "paddw %%mm4, %%mm1         \n\t"\
01180         "psraw $5, %%mm0            \n\t"\
01181         "psraw $5, %%mm1            \n\t"\
01182         "packuswb %%mm1, %%mm0      \n\t"\
01183         OP(%%mm0, (%1),%%mm5, q)\
01184         "add %3, %0                 \n\t"\
01185         "add %4, %1                 \n\t"\
01186         "decl %2                    \n\t"\
01187         " jnz 1b                    \n\t"\
01188         : "+a"(src), "+c"(dst), "+g"(h)\
01189         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
01190         : "memory"\
01191     );\
01192 }\
01193 \
01194 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01195     int h=8;\
01196     __asm__ volatile(\
01197         "pxor %%mm7, %%mm7          \n\t"\
01198         "movq %0, %%mm6             \n\t"\
01199         :: "m"(ff_pw_5)\
01200     );\
01201     do{\
01202     __asm__ volatile(\
01203         "movq    (%0), %%mm0        \n\t"\
01204         "movq   1(%0), %%mm2        \n\t"\
01205         "movq %%mm0, %%mm1          \n\t"\
01206         "movq %%mm2, %%mm3          \n\t"\
01207         "punpcklbw %%mm7, %%mm0     \n\t"\
01208         "punpckhbw %%mm7, %%mm1     \n\t"\
01209         "punpcklbw %%mm7, %%mm2     \n\t"\
01210         "punpckhbw %%mm7, %%mm3     \n\t"\
01211         "paddw %%mm2, %%mm0         \n\t"\
01212         "paddw %%mm3, %%mm1         \n\t"\
01213         "psllw $2, %%mm0            \n\t"\
01214         "psllw $2, %%mm1            \n\t"\
01215         "movq   -1(%0), %%mm2       \n\t"\
01216         "movq    2(%0), %%mm4       \n\t"\
01217         "movq %%mm2, %%mm3          \n\t"\
01218         "movq %%mm4, %%mm5          \n\t"\
01219         "punpcklbw %%mm7, %%mm2     \n\t"\
01220         "punpckhbw %%mm7, %%mm3     \n\t"\
01221         "punpcklbw %%mm7, %%mm4     \n\t"\
01222         "punpckhbw %%mm7, %%mm5     \n\t"\
01223         "paddw %%mm4, %%mm2         \n\t"\
01224         "paddw %%mm3, %%mm5         \n\t"\
01225         "psubw %%mm2, %%mm0         \n\t"\
01226         "psubw %%mm5, %%mm1         \n\t"\
01227         "pmullw %%mm6, %%mm0        \n\t"\
01228         "pmullw %%mm6, %%mm1        \n\t"\
01229         "movd   -2(%0), %%mm2       \n\t"\
01230         "movd    7(%0), %%mm5       \n\t"\
01231         "punpcklbw %%mm7, %%mm2     \n\t"\
01232         "punpcklbw %%mm7, %%mm5     \n\t"\
01233         "paddw %%mm3, %%mm2         \n\t"\
01234         "paddw %%mm5, %%mm4         \n\t"\
01235         "movq %5, %%mm5             \n\t"\
01236         "paddw %%mm5, %%mm2         \n\t"\
01237         "paddw %%mm5, %%mm4         \n\t"\
01238         "paddw %%mm2, %%mm0         \n\t"\
01239         "paddw %%mm4, %%mm1         \n\t"\
01240         "psraw $5, %%mm0            \n\t"\
01241         "psraw $5, %%mm1            \n\t"\
01242         "movq (%2), %%mm4           \n\t"\
01243         "packuswb %%mm1, %%mm0      \n\t"\
01244         PAVGB" %%mm4, %%mm0         \n\t"\
01245         OP(%%mm0, (%1),%%mm5, q)\
01246         "add %4, %0                 \n\t"\
01247         "add %4, %1                 \n\t"\
01248         "add %3, %2                 \n\t"\
01249         : "+a"(src), "+c"(dst), "+d"(src2)\
01250         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01251           "m"(ff_pw_16)\
01252         : "memory"\
01253     );\
01254     }while(--h);\
01255 }\
01256 \
01257 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01258     int w= 2;\
01259     src -= 2*srcStride;\
01260     \
01261     while(w--){\
01262       __asm__ volatile(\
01263         "pxor %%mm7, %%mm7          \n\t"\
01264         "movd (%0), %%mm0           \n\t"\
01265         "add %2, %0                 \n\t"\
01266         "movd (%0), %%mm1           \n\t"\
01267         "add %2, %0                 \n\t"\
01268         "movd (%0), %%mm2           \n\t"\
01269         "add %2, %0                 \n\t"\
01270         "movd (%0), %%mm3           \n\t"\
01271         "add %2, %0                 \n\t"\
01272         "movd (%0), %%mm4           \n\t"\
01273         "add %2, %0                 \n\t"\
01274         "punpcklbw %%mm7, %%mm0     \n\t"\
01275         "punpcklbw %%mm7, %%mm1     \n\t"\
01276         "punpcklbw %%mm7, %%mm2     \n\t"\
01277         "punpcklbw %%mm7, %%mm3     \n\t"\
01278         "punpcklbw %%mm7, %%mm4     \n\t"\
01279         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01280         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01281         QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01282         QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01283         QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01284         QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01285         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01286         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01287          \
01288         : "+a"(src), "+c"(dst)\
01289         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01290         : "memory"\
01291      );\
01292      if(h==16){\
01293         __asm__ volatile(\
01294             QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01295             QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01296             QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01297             QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01298             QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01299             QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01300             QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01301             QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01302             \
01303            : "+a"(src), "+c"(dst)\
01304            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01305            : "memory"\
01306         );\
01307      }\
01308      src += 4-(h+5)*srcStride;\
01309      dst += 4-h*dstStride;\
01310    }\
01311 }\
01312 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
01313     int w = (size+8)>>2;\
01314     src -= 2*srcStride+2;\
01315     while(w--){\
01316         __asm__ volatile(\
01317             "pxor %%mm7, %%mm7      \n\t"\
01318             "movd (%0), %%mm0       \n\t"\
01319             "add %2, %0             \n\t"\
01320             "movd (%0), %%mm1       \n\t"\
01321             "add %2, %0             \n\t"\
01322             "movd (%0), %%mm2       \n\t"\
01323             "add %2, %0             \n\t"\
01324             "movd (%0), %%mm3       \n\t"\
01325             "add %2, %0             \n\t"\
01326             "movd (%0), %%mm4       \n\t"\
01327             "add %2, %0             \n\t"\
01328             "punpcklbw %%mm7, %%mm0 \n\t"\
01329             "punpcklbw %%mm7, %%mm1 \n\t"\
01330             "punpcklbw %%mm7, %%mm2 \n\t"\
01331             "punpcklbw %%mm7, %%mm3 \n\t"\
01332             "punpcklbw %%mm7, %%mm4 \n\t"\
01333             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
01334             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
01335             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
01336             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
01337             QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
01338             QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
01339             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
01340             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
01341             : "+a"(src)\
01342             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01343             : "memory"\
01344         );\
01345         if(size==16){\
01346             __asm__ volatile(\
01347                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
01348                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
01349                 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
01350                 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
01351                 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
01352                 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
01353                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
01354                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
01355                 : "+a"(src)\
01356                 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01357                 : "memory"\
01358             );\
01359         }\
01360         tmp += 4;\
01361         src += 4 - (size+5)*srcStride;\
01362     }\
01363 }\
01364 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
01365     int w = size>>4;\
01366     do{\
01367     int h = size;\
01368     __asm__ volatile(\
01369         "1:                         \n\t"\
01370         "movq     (%0), %%mm0       \n\t"\
01371         "movq    8(%0), %%mm3       \n\t"\
01372         "movq    2(%0), %%mm1       \n\t"\
01373         "movq   10(%0), %%mm4       \n\t"\
01374         "paddw   %%mm4, %%mm0       \n\t"\
01375         "paddw   %%mm3, %%mm1       \n\t"\
01376         "paddw  18(%0), %%mm3       \n\t"\
01377         "paddw  16(%0), %%mm4       \n\t"\
01378         "movq    4(%0), %%mm2       \n\t"\
01379         "movq   12(%0), %%mm5       \n\t"\
01380         "paddw   6(%0), %%mm2       \n\t"\
01381         "paddw  14(%0), %%mm5       \n\t"\
01382         "psubw %%mm1, %%mm0         \n\t"\
01383         "psubw %%mm4, %%mm3         \n\t"\
01384         "psraw $2, %%mm0            \n\t"\
01385         "psraw $2, %%mm3            \n\t"\
01386         "psubw %%mm1, %%mm0         \n\t"\
01387         "psubw %%mm4, %%mm3         \n\t"\
01388         "paddsw %%mm2, %%mm0        \n\t"\
01389         "paddsw %%mm5, %%mm3        \n\t"\
01390         "psraw $2, %%mm0            \n\t"\
01391         "psraw $2, %%mm3            \n\t"\
01392         "paddw %%mm2, %%mm0         \n\t"\
01393         "paddw %%mm5, %%mm3         \n\t"\
01394         "psraw $6, %%mm0            \n\t"\
01395         "psraw $6, %%mm3            \n\t"\
01396         "packuswb %%mm3, %%mm0      \n\t"\
01397         OP(%%mm0, (%1),%%mm7, q)\
01398         "add $48, %0                \n\t"\
01399         "add %3, %1                 \n\t"\
01400         "decl %2                    \n\t"\
01401         " jnz 1b                    \n\t"\
01402         : "+a"(tmp), "+c"(dst), "+g"(h)\
01403         : "S"((x86_reg)dstStride)\
01404         : "memory"\
01405     );\
01406     tmp += 8 - size*24;\
01407     dst += 8 - size*dstStride;\
01408     }while(w--);\
01409 }\
01410 \
01411 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01412     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
01413 }\
01414 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01415     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
01416     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
01417 }\
01418 \
01419 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01420     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
01421     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01422     src += 8*srcStride;\
01423     dst += 8*dstStride;\
01424     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
01425     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01426 }\
01427 \
01428 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01429     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
01430     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01431     src += 8*dstStride;\
01432     dst += 8*dstStride;\
01433     src2 += 8*src2Stride;\
01434     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
01435     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01436 }\
01437 \
01438 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
01439           put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
01440     OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
01441 }\
01442 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01443     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
01444 }\
01445 \
01446 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01447     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
01448 }\
01449 \
01450 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01451 {\
01452     __asm__ volatile(\
01453         "movq      (%1), %%mm0          \n\t"\
01454         "movq    24(%1), %%mm1          \n\t"\
01455         "psraw      $5,  %%mm0          \n\t"\
01456         "psraw      $5,  %%mm1          \n\t"\
01457         "packuswb %%mm0, %%mm0          \n\t"\
01458         "packuswb %%mm1, %%mm1          \n\t"\
01459         PAVGB"     (%0), %%mm0          \n\t"\
01460         PAVGB"  (%0,%3), %%mm1          \n\t"\
01461         OP(%%mm0, (%2),    %%mm4, d)\
01462         OP(%%mm1, (%2,%4), %%mm5, d)\
01463         "lea  (%0,%3,2), %0             \n\t"\
01464         "lea  (%2,%4,2), %2             \n\t"\
01465         "movq    48(%1), %%mm0          \n\t"\
01466         "movq    72(%1), %%mm1          \n\t"\
01467         "psraw      $5,  %%mm0          \n\t"\
01468         "psraw      $5,  %%mm1          \n\t"\
01469         "packuswb %%mm0, %%mm0          \n\t"\
01470         "packuswb %%mm1, %%mm1          \n\t"\
01471         PAVGB"     (%0), %%mm0          \n\t"\
01472         PAVGB"  (%0,%3), %%mm1          \n\t"\
01473         OP(%%mm0, (%2),    %%mm4, d)\
01474         OP(%%mm1, (%2,%4), %%mm5, d)\
01475         :"+a"(src8), "+c"(src16), "+d"(dst)\
01476         :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
01477         :"memory");\
01478 }\
01479 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01480 {\
01481     do{\
01482     __asm__ volatile(\
01483         "movq      (%1), %%mm0          \n\t"\
01484         "movq     8(%1), %%mm1          \n\t"\
01485         "movq    48(%1), %%mm2          \n\t"\
01486         "movq  8+48(%1), %%mm3          \n\t"\
01487         "psraw      $5,  %%mm0          \n\t"\
01488         "psraw      $5,  %%mm1          \n\t"\
01489         "psraw      $5,  %%mm2          \n\t"\
01490         "psraw      $5,  %%mm3          \n\t"\
01491         "packuswb %%mm1, %%mm0          \n\t"\
01492         "packuswb %%mm3, %%mm2          \n\t"\
01493         PAVGB"     (%0), %%mm0          \n\t"\
01494         PAVGB"  (%0,%3), %%mm2          \n\t"\
01495         OP(%%mm0, (%2), %%mm5, q)\
01496         OP(%%mm2, (%2,%4), %%mm5, q)\
01497         ::"a"(src8), "c"(src16), "d"(dst),\
01498           "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
01499         :"memory");\
01500         src8 += 2L*src8Stride;\
01501         src16 += 48;\
01502         dst += 2L*dstStride;\
01503     }while(h-=2);\
01504 }\
01505 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01506 {\
01507     OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
01508     OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
01509 }\
01510 
01511 
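/*
 * For reference: the *_h_lowpass* and *_v_lowpass* routines in this file all
 * evaluate the H.264 six-tap half-sample filter (1, -5, 20, 20, -5, 1) with
 * +16 rounding, a shift by 5 and a clip to 8 bits.  A plain-C sketch of one
 * horizontal pass (hypothetical helper, shown only to document the arithmetic
 * that the MMX/SSE code above vectorizes):
 *
 *     static void h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
 *                                         int dstStride, int srcStride,
 *                                         int width, int height)
 *     {
 *         int x, y;
 *         for (y = 0; y < height; y++) {
 *             for (x = 0; x < width; x++) {
 *                 int sum = src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                         - 5*src[x+2] + src[x+3];
 *                 dst[x] = av_clip_uint8((sum + 16) >> 5);
 *             }
 *             src += srcStride;
 *             dst += dstStride;
 *         }
 *     }
 */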
01512 #if ARCH_X86_64
01513 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01514 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01515     int h=16;\
01516     __asm__ volatile(\
01517         "pxor %%xmm15, %%xmm15      \n\t"\
01518         "movdqa %6, %%xmm14         \n\t"\
01519         "movdqa %7, %%xmm13         \n\t"\
01520         "1:                         \n\t"\
01521         "lddqu    6(%0), %%xmm1     \n\t"\
01522         "lddqu   -2(%0), %%xmm7     \n\t"\
01523         "movdqa  %%xmm1, %%xmm0     \n\t"\
01524         "punpckhbw %%xmm15, %%xmm1  \n\t"\
01525         "punpcklbw %%xmm15, %%xmm0  \n\t"\
01526         "punpcklbw %%xmm15, %%xmm7  \n\t"\
01527         "movdqa  %%xmm1, %%xmm2     \n\t"\
01528         "movdqa  %%xmm0, %%xmm6     \n\t"\
01529         "movdqa  %%xmm1, %%xmm3     \n\t"\
01530         "movdqa  %%xmm0, %%xmm8     \n\t"\
01531         "movdqa  %%xmm1, %%xmm4     \n\t"\
01532         "movdqa  %%xmm0, %%xmm9     \n\t"\
01533         "movdqa  %%xmm0, %%xmm12    \n\t"\
01534         "movdqa  %%xmm1, %%xmm11    \n\t"\
01535         "palignr $10,%%xmm0, %%xmm11\n\t"\
01536         "palignr $10,%%xmm7, %%xmm12\n\t"\
01537         "palignr $2, %%xmm0, %%xmm4 \n\t"\
01538         "palignr $2, %%xmm7, %%xmm9 \n\t"\
01539         "palignr $4, %%xmm0, %%xmm3 \n\t"\
01540         "palignr $4, %%xmm7, %%xmm8 \n\t"\
01541         "palignr $6, %%xmm0, %%xmm2 \n\t"\
01542         "palignr $6, %%xmm7, %%xmm6 \n\t"\
01543         "paddw   %%xmm0 ,%%xmm11    \n\t"\
01544         "palignr $8, %%xmm0, %%xmm1 \n\t"\
01545         "palignr $8, %%xmm7, %%xmm0 \n\t"\
01546         "paddw   %%xmm12,%%xmm7     \n\t"\
01547         "paddw   %%xmm3, %%xmm2     \n\t"\
01548         "paddw   %%xmm8, %%xmm6     \n\t"\
01549         "paddw   %%xmm4, %%xmm1     \n\t"\
01550         "paddw   %%xmm9, %%xmm0     \n\t"\
01551         "psllw   $2,     %%xmm2     \n\t"\
01552         "psllw   $2,     %%xmm6     \n\t"\
01553         "psubw   %%xmm1, %%xmm2     \n\t"\
01554         "psubw   %%xmm0, %%xmm6     \n\t"\
01555         "paddw   %%xmm13,%%xmm11    \n\t"\
01556         "paddw   %%xmm13,%%xmm7     \n\t"\
01557         "pmullw  %%xmm14,%%xmm2     \n\t"\
01558         "pmullw  %%xmm14,%%xmm6     \n\t"\
01559         "lddqu   (%2),   %%xmm3     \n\t"\
01560         "paddw   %%xmm11,%%xmm2     \n\t"\
01561         "paddw   %%xmm7, %%xmm6     \n\t"\
01562         "psraw   $5,     %%xmm2     \n\t"\
01563         "psraw   $5,     %%xmm6     \n\t"\
01564         "packuswb %%xmm2,%%xmm6     \n\t"\
01565         "pavgb   %%xmm3, %%xmm6     \n\t"\
01566         OP(%%xmm6, (%1), %%xmm4, dqa)\
01567         "add %5, %0                 \n\t"\
01568         "add %5, %1                 \n\t"\
01569         "add %4, %2                 \n\t"\
01570         "decl %3                    \n\t"\
01571         "jg 1b                      \n\t"\
01572         : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
01573         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01574           "m"(ff_pw_5), "m"(ff_pw_16)\
01575         : "memory"\
01576     );\
01577 }
01578 #else // ARCH_X86_64
01579 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01580 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01581     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
01582     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01583     src += 8*dstStride;\
01584     dst += 8*dstStride;\
01585     src2 += 8*src2Stride;\
01586     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
01587     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01588 }
01589 #endif // ARCH_X86_64
01590 
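/* On 32-bit x86 only xmm0-xmm7 are available, so the 16-wide SSSE3 l2 filter
 * cannot keep all shifted tap vectors live at once; it simply falls back to
 * four 8-wide calls covering the four 8x8 quadrants. */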
01591 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
01592 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01593     int h=8;\
01594     __asm__ volatile(\
01595         "pxor %%xmm7, %%xmm7        \n\t"\
01596         "movdqa %0, %%xmm6          \n\t"\
01597         :: "m"(ff_pw_5)\
01598     );\
01599     do{\
01600     __asm__ volatile(\
01601         "lddqu   -2(%0), %%xmm1     \n\t"\
01602         "movdqa  %%xmm1, %%xmm0     \n\t"\
01603         "punpckhbw %%xmm7, %%xmm1   \n\t"\
01604         "punpcklbw %%xmm7, %%xmm0   \n\t"\
01605         "movdqa  %%xmm1, %%xmm2     \n\t"\
01606         "movdqa  %%xmm1, %%xmm3     \n\t"\
01607         "movdqa  %%xmm1, %%xmm4     \n\t"\
01608         "movdqa  %%xmm1, %%xmm5     \n\t"\
01609         "palignr $2, %%xmm0, %%xmm4 \n\t"\
01610         "palignr $4, %%xmm0, %%xmm3 \n\t"\
01611         "palignr $6, %%xmm0, %%xmm2 \n\t"\
01612         "palignr $8, %%xmm0, %%xmm1 \n\t"\
01613         "palignr $10,%%xmm0, %%xmm5 \n\t"\
01614         "paddw   %%xmm5, %%xmm0     \n\t"\
01615         "paddw   %%xmm3, %%xmm2     \n\t"\
01616         "paddw   %%xmm4, %%xmm1     \n\t"\
01617         "psllw   $2,     %%xmm2     \n\t"\
01618         "movq    (%2),   %%xmm3     \n\t"\
01619         "psubw   %%xmm1, %%xmm2     \n\t"\
01620         "paddw   %5,     %%xmm0     \n\t"\
01621         "pmullw  %%xmm6, %%xmm2     \n\t"\
01622         "paddw   %%xmm0, %%xmm2     \n\t"\
01623         "psraw   $5,     %%xmm2     \n\t"\
01624         "packuswb %%xmm2, %%xmm2    \n\t"\
01625         "pavgb   %%xmm3, %%xmm2     \n\t"\
01626         OP(%%xmm2, (%1), %%xmm4, q)\
01627         "add %4, %0                 \n\t"\
01628         "add %4, %1                 \n\t"\
01629         "add %3, %2                 \n\t"\
01630         : "+a"(src), "+c"(dst), "+d"(src2)\
01631         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01632           "m"(ff_pw_16)\
01633         : "memory"\
01634     );\
01635     }while(--h);\
01636 }\
01637 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
01638 \
01639 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01640     int h=8;\
01641     __asm__ volatile(\
01642         "pxor %%xmm7, %%xmm7        \n\t"\
01643         "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
01644         "1:                         \n\t"\
01645         "lddqu   -2(%0), %%xmm1     \n\t"\
01646         "movdqa  %%xmm1, %%xmm0     \n\t"\
01647         "punpckhbw %%xmm7, %%xmm1   \n\t"\
01648         "punpcklbw %%xmm7, %%xmm0   \n\t"\
01649         "movdqa  %%xmm1, %%xmm2     \n\t"\
01650         "movdqa  %%xmm1, %%xmm3     \n\t"\
01651         "movdqa  %%xmm1, %%xmm4     \n\t"\
01652         "movdqa  %%xmm1, %%xmm5     \n\t"\
01653         "palignr $2, %%xmm0, %%xmm4 \n\t"\
01654         "palignr $4, %%xmm0, %%xmm3 \n\t"\
01655         "palignr $6, %%xmm0, %%xmm2 \n\t"\
01656         "palignr $8, %%xmm0, %%xmm1 \n\t"\
01657         "palignr $10,%%xmm0, %%xmm5 \n\t"\
01658         "paddw   %%xmm5, %%xmm0     \n\t"\
01659         "paddw   %%xmm3, %%xmm2     \n\t"\
01660         "paddw   %%xmm4, %%xmm1     \n\t"\
01661         "psllw   $2,     %%xmm2     \n\t"\
01662         "psubw   %%xmm1, %%xmm2     \n\t"\
01663         "paddw   "MANGLE(ff_pw_16)", %%xmm0\n\t"\
01664         "pmullw  %%xmm6, %%xmm2     \n\t"\
01665         "paddw   %%xmm0, %%xmm2     \n\t"\
01666         "psraw   $5,     %%xmm2     \n\t"\
01667         "packuswb %%xmm2, %%xmm2    \n\t"\
01668         OP(%%xmm2, (%1), %%xmm4, q)\
01669         "add %3, %0                 \n\t"\
01670         "add %4, %1                 \n\t"\
01671         "decl %2                    \n\t"\
01672         " jnz 1b                    \n\t"\
01673         : "+a"(src), "+c"(dst), "+g"(h)\
01674         : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
01675         : "memory"\
01676     );\
01677 }\
01678 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01679     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
01680     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01681     src += 8*srcStride;\
01682     dst += 8*dstStride;\
01683     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
01684     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01685 }\
01686 
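/* The SSSE3 horizontal filters load src[-2..13] with a single unaligned lddqu
 * and use palignr to derive the shifted neighbour vectors from that one load,
 * instead of issuing a separate unaligned load per tap. */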
01687 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
01688 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01689     src -= 2*srcStride;\
01690     \
01691     __asm__ volatile(\
01692         "pxor %%xmm7, %%xmm7        \n\t"\
01693         "movq (%0), %%xmm0          \n\t"\
01694         "add %2, %0                 \n\t"\
01695         "movq (%0), %%xmm1          \n\t"\
01696         "add %2, %0                 \n\t"\
01697         "movq (%0), %%xmm2          \n\t"\
01698         "add %2, %0                 \n\t"\
01699         "movq (%0), %%xmm3          \n\t"\
01700         "add %2, %0                 \n\t"\
01701         "movq (%0), %%xmm4          \n\t"\
01702         "add %2, %0                 \n\t"\
01703         "punpcklbw %%xmm7, %%xmm0   \n\t"\
01704         "punpcklbw %%xmm7, %%xmm1   \n\t"\
01705         "punpcklbw %%xmm7, %%xmm2   \n\t"\
01706         "punpcklbw %%xmm7, %%xmm3   \n\t"\
01707         "punpcklbw %%xmm7, %%xmm4   \n\t"\
01708         QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
01709         QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
01710         QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
01711         QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
01712         QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
01713         QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
01714         QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
01715         QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
01716          \
01717         : "+a"(src), "+c"(dst)\
01718         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01719         : "memory"\
01720     );\
01721     if(h==16){\
01722         __asm__ volatile(\
01723             QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
01724             QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
01725             QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
01726             QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
01727             QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
01728             QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
01729             QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
01730             QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
01731             \
01732             : "+a"(src), "+c"(dst)\
01733             : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01734             : "memory"\
01735         );\
01736     }\
01737 }\
01738 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01739     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
01740 }\
01741 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01742     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
01743     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
01744 }
01745 
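/* The SSE2 vertical filter processes eight columns per pass: QPEL_H264V_XMM
 * acts as a six-register sliding window over the source rows, loading one new
 * row and emitting one filtered output row per invocation. */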
01746 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
01747     int w = (size+8)>>3;
01748     src -= 2*srcStride+2;
01749     while(w--){
01750         __asm__ volatile(
01751             "pxor %%xmm7, %%xmm7        \n\t"
01752             "movq (%0), %%xmm0          \n\t"
01753             "add %2, %0                 \n\t"
01754             "movq (%0), %%xmm1          \n\t"
01755             "add %2, %0                 \n\t"
01756             "movq (%0), %%xmm2          \n\t"
01757             "add %2, %0                 \n\t"
01758             "movq (%0), %%xmm3          \n\t"
01759             "add %2, %0                 \n\t"
01760             "movq (%0), %%xmm4          \n\t"
01761             "add %2, %0                 \n\t"
01762             "punpcklbw %%xmm7, %%xmm0   \n\t"
01763             "punpcklbw %%xmm7, %%xmm1   \n\t"
01764             "punpcklbw %%xmm7, %%xmm2   \n\t"
01765             "punpcklbw %%xmm7, %%xmm3   \n\t"
01766             "punpcklbw %%xmm7, %%xmm4   \n\t"
01767             QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
01768             QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
01769             QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
01770             QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
01771             QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
01772             QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
01773             QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
01774             QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
01775             : "+a"(src)
01776             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
01777             : "memory"
01778         );
01779         if(size==16){
01780             __asm__ volatile(
01781                 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
01782                 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
01783                 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
01784                 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
01785                 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
01786                 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
01787                 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
01788                 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
01789                 : "+a"(src)
01790                 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
01791                 : "memory"
01792             );
01793         }
01794         tmp += 8;
01795         src += 8 - (size+5)*srcStride;
01796     }
01797 }
01798 
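/* First pass of the separable 2-D (hv) filter: the vertical six-tap filter is
 * applied eight columns at a time and the 16-bit sums are written to tmp,
 * whose rows are 48 bytes (24 int16_t) apart so the second pass can use one
 * fixed stride for both the 8- and 16-wide cases.  The rounding constant
 * appears to be folded into this pass (ff_pw_16 is passed here but not to the
 * second pass): a per-value +16 becomes the +512 rounding of the combined
 * filter once the second pass, whose tap weights sum to 32, is applied. */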
01799 #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
01800 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
01801     int h = size;\
01802     if(size == 16){\
01803         __asm__ volatile(\
01804             "1:                         \n\t"\
01805             "movdqa 32(%0), %%xmm4      \n\t"\
01806             "movdqa 16(%0), %%xmm5      \n\t"\
01807             "movdqa   (%0), %%xmm7      \n\t"\
01808             "movdqa %%xmm4, %%xmm3      \n\t"\
01809             "movdqa %%xmm4, %%xmm2      \n\t"\
01810             "movdqa %%xmm4, %%xmm1      \n\t"\
01811             "movdqa %%xmm4, %%xmm0      \n\t"\
01812             "palignr $10, %%xmm5, %%xmm0 \n\t"\
01813             "palignr  $8, %%xmm5, %%xmm1 \n\t"\
01814             "palignr  $6, %%xmm5, %%xmm2 \n\t"\
01815             "palignr  $4, %%xmm5, %%xmm3 \n\t"\
01816             "palignr  $2, %%xmm5, %%xmm4 \n\t"\
01817             "paddw  %%xmm5, %%xmm0      \n\t"\
01818             "paddw  %%xmm4, %%xmm1      \n\t"\
01819             "paddw  %%xmm3, %%xmm2      \n\t"\
01820             "movdqa %%xmm5, %%xmm6      \n\t"\
01821             "movdqa %%xmm5, %%xmm4      \n\t"\
01822             "movdqa %%xmm5, %%xmm3      \n\t"\
01823             "palignr  $8, %%xmm7, %%xmm4 \n\t"\
01824             "palignr  $2, %%xmm7, %%xmm6 \n\t"\
01825             "palignr $10, %%xmm7, %%xmm3 \n\t"\
01826             "paddw  %%xmm6, %%xmm4      \n\t"\
01827             "movdqa %%xmm5, %%xmm6      \n\t"\
01828             "palignr  $6, %%xmm7, %%xmm5 \n\t"\
01829             "palignr  $4, %%xmm7, %%xmm6 \n\t"\
01830             "paddw  %%xmm7, %%xmm3      \n\t"\
01831             "paddw  %%xmm6, %%xmm5      \n\t"\
01832             \
01833             "psubw  %%xmm1, %%xmm0      \n\t"\
01834             "psubw  %%xmm4, %%xmm3      \n\t"\
01835             "psraw      $2, %%xmm0      \n\t"\
01836             "psraw      $2, %%xmm3      \n\t"\
01837             "psubw  %%xmm1, %%xmm0      \n\t"\
01838             "psubw  %%xmm4, %%xmm3      \n\t"\
01839             "paddw  %%xmm2, %%xmm0      \n\t"\
01840             "paddw  %%xmm5, %%xmm3      \n\t"\
01841             "psraw      $2, %%xmm0      \n\t"\
01842             "psraw      $2, %%xmm3      \n\t"\
01843             "paddw  %%xmm2, %%xmm0      \n\t"\
01844             "paddw  %%xmm5, %%xmm3      \n\t"\
01845             "psraw      $6, %%xmm0      \n\t"\
01846             "psraw      $6, %%xmm3      \n\t"\
01847             "packuswb %%xmm0, %%xmm3    \n\t"\
01848             OP(%%xmm3, (%1), %%xmm7, dqa)\
01849             "add $48, %0                \n\t"\
01850             "add %3, %1                 \n\t"\
01851             "decl %2                    \n\t"\
01852             " jnz 1b                    \n\t"\
01853             : "+a"(tmp), "+c"(dst), "+g"(h)\
01854             : "S"((x86_reg)dstStride)\
01855             : "memory"\
01856         );\
01857     }else{\
01858         __asm__ volatile(\
01859             "1:                         \n\t"\
01860             "movdqa 16(%0), %%xmm1      \n\t"\
01861             "movdqa   (%0), %%xmm0      \n\t"\
01862             "movdqa %%xmm1, %%xmm2      \n\t"\
01863             "movdqa %%xmm1, %%xmm3      \n\t"\
01864             "movdqa %%xmm1, %%xmm4      \n\t"\
01865             "movdqa %%xmm1, %%xmm5      \n\t"\
01866             "palignr $10, %%xmm0, %%xmm5 \n\t"\
01867             "palignr  $8, %%xmm0, %%xmm4 \n\t"\
01868             "palignr  $6, %%xmm0, %%xmm3 \n\t"\
01869             "palignr  $4, %%xmm0, %%xmm2 \n\t"\
01870             "palignr  $2, %%xmm0, %%xmm1 \n\t"\
01871             "paddw  %%xmm5, %%xmm0      \n\t"\
01872             "paddw  %%xmm4, %%xmm1      \n\t"\
01873             "paddw  %%xmm3, %%xmm2      \n\t"\
01874             "psubw  %%xmm1, %%xmm0      \n\t"\
01875             "psraw      $2, %%xmm0      \n\t"\
01876             "psubw  %%xmm1, %%xmm0      \n\t"\
01877             "paddw  %%xmm2, %%xmm0      \n\t"\
01878             "psraw      $2, %%xmm0      \n\t"\
01879             "paddw  %%xmm2, %%xmm0      \n\t"\
01880             "psraw      $6, %%xmm0      \n\t"\
01881             "packuswb %%xmm0, %%xmm0    \n\t"\
01882             OP(%%xmm0, (%1), %%xmm7, q)\
01883             "add $48, %0                \n\t"\
01884             "add %3, %1                 \n\t"\
01885             "decl %2                    \n\t"\
01886             " jnz 1b                    \n\t"\
01887             : "+a"(tmp), "+c"(dst), "+g"(h)\
01888             : "S"((x86_reg)dstStride)\
01889             : "memory"\
01890         );\
01891     }\
01892 }
01893 
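/* Second pass of the 2-D filter (same scheme as the MMX hv2 routine above):
 * the horizontal six-tap filter is evaluated on the 16-bit intermediate rows
 * using the tap pairs A = (a+f), B = (b+e), C = (c+d) as
 * (((((A - B) >> 2) - B + C) >> 2) + C) >> 6, i.e. the 1/-5/20 weighting with
 * the separable filter's overall >>10 split across three shifts, before
 * saturating to 8 bits with packuswb. */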
01894 #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
01895 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
01896           put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
01897     OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
01898 }\
01899 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01900     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
01901 }\
01902 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01903     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
01904 }\
01905 
01906 #define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
01907 #define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
01908 #define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
01909 #define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
01910 #define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
01911 #define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
01912 #define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
01913 #define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
01914 
01915 #define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
01916 #define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
01917 #define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
01918 #define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
01919 #define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
01920 #define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
01921 #define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
01922 #define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
01923 
01924 #define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
01925 #define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
01926 #define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
01927 #define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
01928 
01929 #define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
01930 #define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
01931 #define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
01932 #define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
01933 
01934 #define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
01935 #define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
01936 
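/* For helpers that have no wider implementation of their own (the pixel
 * averaging and shift5 routines, and on plain SSE2 the l2 horizontal filter),
 * the _sse2/_ssse3 names are simply aliased to the MMX2 versions so the
 * H264_MC_* macros below can be instantiated uniformly per CPU flavour. */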
01937 #define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
01938 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
01939 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
01940 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
01941 H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
01942 
01943 static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
01944     put_pixels16_sse2(dst, src, stride, 16);
01945 }
01946 static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
01947     avg_pixels16_sse2(dst, src, stride, 16);
01948 }
01949 #define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
01950 #define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
01951 
01952 #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
01953 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01954     OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
01955 }\
01956 
01957 #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
01958 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01959     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
01960 }\
01961 \
01962 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01963     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
01964 }\
01965 \
01966 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01967     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
01968 }\
01969 
01970 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
01971 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01972     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
01973     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
01974     OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
01975 }\
01976 \
01977 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01978     OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
01979 }\
01980 \
01981 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01982     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
01983     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
01984     OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
01985 }\
01986 
01987 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
01988 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01989     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
01990     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
01991     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
01992 }\
01993 \
01994 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01995     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
01996     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
01997     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
01998 }\
01999 \
02000 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02001     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
02002     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
02003     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
02004 }\
02005 \
02006 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02007     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
02008     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
02009     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
02010 }\
02011 \
02012 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02013     DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
02014     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
02015 }\
02016 \
02017 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02018     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
02019     uint8_t * const halfHV= temp;\
02020     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
02021     assert(((int)temp & 7) == 0);\
02022     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
02023     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
02024 }\
02025 \
02026 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02027     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
02028     uint8_t * const halfHV= temp;\
02029     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
02030     assert(((int)temp & 7) == 0);\
02031     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
02032     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
02033 }\
02034 \
02035 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02036     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
02037     uint8_t * const halfHV= temp;\
02038     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
02039     assert(((int)temp & 7) == 0);\
02040     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
02041     OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
02042 }\
02043 \
02044 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02045     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
02046     uint8_t * const halfHV= temp;\
02047     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
02048     assert(((int)temp & 7) == 0);\
02049     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
02050     OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
02051 }\
02052 
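/* H264_MC_* expand to the 16 quarter-pel motion compensation entry points per
 * block size.  In mcXY, X is the horizontal and Y the vertical quarter-sample
 * phase: mc20/mc02 are the pure half-sample filters, mc22 is the 2-D (hv)
 * half-sample case, and the remaining quarter-sample positions are produced by
 * averaging a half-sample plane with the integer samples or with another
 * half-sample plane (the *_l2 and *_l2_shift5 helpers above). */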
02053 #define H264_MC_4816(MMX)\
02054 H264_MC(put_, 4, MMX, 8)\
02055 H264_MC(put_, 8, MMX, 8)\
02056 H264_MC(put_, 16,MMX, 8)\
02057 H264_MC(avg_, 4, MMX, 8)\
02058 H264_MC(avg_, 8, MMX, 8)\
02059 H264_MC(avg_, 16,MMX, 8)\
02060 
02061 #define H264_MC_816(QPEL, XMM)\
02062 QPEL(put_, 8, XMM, 16)\
02063 QPEL(put_, 16,XMM, 16)\
02064 QPEL(avg_, 8, XMM, 16)\
02065 QPEL(avg_, 16,XMM, 16)\
02066 
02067 
02068 #define AVG_3DNOW_OP(a,b,temp, size) \
02069 "mov" #size " " #b ", " #temp "   \n\t"\
02070 "pavgusb " #temp ", " #a "        \n\t"\
02071 "mov" #size " " #a ", " #b "      \n\t"
02072 #define AVG_MMX2_OP(a,b,temp, size) \
02073 "mov" #size " " #b ", " #temp "   \n\t"\
02074 "pavgb " #temp ", " #a "          \n\t"\
02075 "mov" #size " " #a ", " #b "      \n\t"
02076 
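/* OP selects how a result is written out: PUT_OP is a plain store, while the
 * AVG_* variants average with the bytes already at the destination (pavgusb on
 * 3DNow!, pavgb on MMX2/SSE2).  This is all that distinguishes the put_ and
 * avg_ instantiations below. */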
02077 #define PAVGB "pavgusb"
02078 QPEL_H264(put_,       PUT_OP, 3dnow)
02079 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
02080 #undef PAVGB
02081 #define PAVGB "pavgb"
02082 QPEL_H264(put_,       PUT_OP, mmx2)
02083 QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
02084 QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
02085 QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
02086 QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
02087 QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
02088 #if HAVE_SSSE3
02089 QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
02090 QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
02091 QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
02092 QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
02093 QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
02094 QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
02095 #endif
02096 #undef PAVGB
02097 
02098 H264_MC_4816(3dnow)
02099 H264_MC_4816(mmx2)
02100 H264_MC_816(H264_MC_V, sse2)
02101 H264_MC_816(H264_MC_HV, sse2)
02102 #if HAVE_SSSE3
02103 H264_MC_816(H264_MC_H, ssse3)
02104 H264_MC_816(H264_MC_HV, ssse3)
02105 #endif
02106 
02107 /* rounding constants interleaved with their value divided by 8, so for a pointer p into this table p[0] is rnd and p[1] is rnd/8 */
02108 DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
02109     0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
02110 };
02111 
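/*
 * The chroma MC code included below implements the H.264 bilinear chroma
 * interpolation.  The constants come in rnd / (rnd/8) pairs because the 2-D
 * case divides by 64 and the 1-D case (x == 0 or y == 0) divides by 8; the
 * 28/3 pair is the downward-biased variant used by the VC-1 "nornd" wrappers.
 * A plain-C sketch of the 2-D case (hypothetical helper, shown only to
 * document the arithmetic):
 *
 *     static void chroma_mc8_2d_ref(uint8_t *dst, const uint8_t *src,
 *                                   int stride, int h, int x, int y)
 *     {
 *         int i, j;
 *         for (j = 0; j < h; j++) {
 *             for (i = 0; i < 8; i++)
 *                 dst[i] = (src[i]          * (8-x) * (8-y) +
 *                           src[i+1]        *    x  * (8-y) +
 *                           src[i+stride]   * (8-x) *    y  +
 *                           src[i+stride+1] *    x  *    y  + 32) >> 6;
 *             src += stride;
 *             dst += stride;
 *         }
 *     }
 */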
02112 #define H264_CHROMA_OP(S,D)
02113 #define H264_CHROMA_OP4(S,D,T)
02114 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
02115 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
02116 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
02117 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
02118 #include "dsputil_h264_template_mmx.c"
02119 
02120 static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02121 {
02122     put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
02123 }
02124 static void put_vc1_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02125 {
02126     put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
02127 }
02128 static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02129 {
02130     put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
02131 }
02132 
02133 #undef H264_CHROMA_OP
02134 #undef H264_CHROMA_OP4
02135 #undef H264_CHROMA_MC8_TMPL
02136 #undef H264_CHROMA_MC4_TMPL
02137 #undef H264_CHROMA_MC2_TMPL
02138 #undef H264_CHROMA_MC8_MV0
02139 
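/* The chroma MC functions are stamped out by re-including
 * dsputil_h264_template_mmx.c: H264_CHROMA_OP/OP4 define how the result is
 * combined with the destination (empty for put_, pavgb/pavgusb for avg_), and
 * the *_TMPL and MC8_MV0 macros select the generated names and the x==y==0
 * fast path for each instruction-set flavour. */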
02140 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
02141 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
02142                                "pavgb " #T ", " #D " \n\t"
02143 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
02144 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
02145 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
02146 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
02147 #include "dsputil_h264_template_mmx.c"
02148 static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02149 {
02150     avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
02151 }
02152 static void avg_vc1_chroma_mc8_mmx2_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02153 {
02154     avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg+2);
02155 }
02156 static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02157 {
02158     avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
02159 }
02160 #undef H264_CHROMA_OP
02161 #undef H264_CHROMA_OP4
02162 #undef H264_CHROMA_MC8_TMPL
02163 #undef H264_CHROMA_MC4_TMPL
02164 #undef H264_CHROMA_MC2_TMPL
02165 #undef H264_CHROMA_MC8_MV0
02166 
02167 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
02168 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
02169                                "pavgusb " #T ", " #D " \n\t"
02170 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
02171 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
02172 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
02173 #include "dsputil_h264_template_mmx.c"
02174 static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02175 {
02176     avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
02177 }
02178 static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02179 {
02180     avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
02181 }
02182 #undef H264_CHROMA_OP
02183 #undef H264_CHROMA_OP4
02184 #undef H264_CHROMA_MC8_TMPL
02185 #undef H264_CHROMA_MC4_TMPL
02186 #undef H264_CHROMA_MC8_MV0
02187 
02188 #if HAVE_SSSE3
02189 #define AVG_OP(X)
02190 #undef H264_CHROMA_MC8_TMPL
02191 #undef H264_CHROMA_MC4_TMPL
02192 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
02193 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
02194 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
02195 #include "dsputil_h264_template_ssse3.c"
02196 static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02197 {
02198     put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
02199 }
02200 static void put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02201 {
02202     put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
02203 }
02204 
02205 #undef AVG_OP
02206 #undef H264_CHROMA_MC8_TMPL
02207 #undef H264_CHROMA_MC4_TMPL
02208 #undef H264_CHROMA_MC8_MV0
02209 #define AVG_OP(X) X
02210 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
02211 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
02212 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
02213 #include "dsputil_h264_template_ssse3.c"
02214 static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02215 {
02216     avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
02217 }
02218 static void avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
02219 {
02220     avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
02221 }
02222 #undef AVG_OP
02223 #undef H264_CHROMA_MC8_TMPL
02224 #undef H264_CHROMA_MC4_TMPL
02225 #undef H264_CHROMA_MC8_MV0
02226 #endif
02227 
02228 /***********************************/
02229 /* weighted prediction */
02230 
02231 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
02232 {
02233     int x, y;
02234     offset <<= log2_denom;
02235     offset += (1 << log2_denom) >> 1;
02236     __asm__ volatile(
02237         "movd    %0, %%mm4        \n\t"
02238         "movd    %1, %%mm5        \n\t"
02239         "movd    %2, %%mm6        \n\t"
02240         "pshufw  $0, %%mm4, %%mm4 \n\t"
02241         "pshufw  $0, %%mm5, %%mm5 \n\t"
02242         "pxor    %%mm7, %%mm7     \n\t"
02243         :: "g"(weight), "g"(offset), "g"(log2_denom)
02244     );
02245     for(y=0; y<h; y+=2){
02246         for(x=0; x<w; x+=4){
02247             __asm__ volatile(
02248                 "movd      %0,    %%mm0 \n\t"
02249                 "movd      %1,    %%mm1 \n\t"
02250                 "punpcklbw %%mm7, %%mm0 \n\t"
02251                 "punpcklbw %%mm7, %%mm1 \n\t"
02252                 "pmullw    %%mm4, %%mm0 \n\t"
02253                 "pmullw    %%mm4, %%mm1 \n\t"
02254                 "paddsw    %%mm5, %%mm0 \n\t"
02255                 "paddsw    %%mm5, %%mm1 \n\t"
02256                 "psraw     %%mm6, %%mm0 \n\t"
02257                 "psraw     %%mm6, %%mm1 \n\t"
02258                 "packuswb  %%mm7, %%mm0 \n\t"
02259                 "packuswb  %%mm7, %%mm1 \n\t"
02260                 "movd      %%mm0, %0    \n\t"
02261                 "movd      %%mm1, %1    \n\t"
02262                 : "+m"(*(uint32_t*)(dst+x)),
02263                   "+m"(*(uint32_t*)(dst+x+stride))
02264             );
02265         }
02266         dst += 2*stride;
02267     }
02268 }
02269 
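/*
 * This is H.264 explicit weighted prediction for a single reference: the
 * offset pre-scaling above folds the spec's rounding term and offset into one
 * addend, so the loop reduces to a multiply, add and arithmetic shift per
 * sample.  Scalar equivalent (hypothetical helper, for reference only):
 *
 *     static void h264_weight_ref(uint8_t *dst, int stride, int log2_denom,
 *                                 int weight, int offset, int w, int h)
 *     {
 *         int x, y;
 *         for (y = 0; y < h; y++) {
 *             for (x = 0; x < w; x++) {
 *                 int v = dst[x] * weight + (offset << log2_denom)
 *                       + ((1 << log2_denom) >> 1);
 *                 dst[x] = av_clip_uint8(v >> log2_denom);
 *             }
 *             dst += stride;
 *         }
 *     }
 */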
02270 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
02271 {
02272     int x, y;
02273     offset = ((offset + 1) | 1) << log2_denom;
02274     __asm__ volatile(
02275         "movd    %0, %%mm3        \n\t"
02276         "movd    %1, %%mm4        \n\t"
02277         "movd    %2, %%mm5        \n\t"
02278         "movd    %3, %%mm6        \n\t"
02279         "pshufw  $0, %%mm3, %%mm3 \n\t"
02280         "pshufw  $0, %%mm4, %%mm4 \n\t"
02281         "pshufw  $0, %%mm5, %%mm5 \n\t"
02282         "pxor    %%mm7, %%mm7     \n\t"
02283         :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
02284     );
02285     for(y=0; y<h; y++){
02286         for(x=0; x<w; x+=4){
02287             __asm__ volatile(
02288                 "movd      %0,    %%mm0 \n\t"
02289                 "movd      %1,    %%mm1 \n\t"
02290                 "punpcklbw %%mm7, %%mm0 \n\t"
02291                 "punpcklbw %%mm7, %%mm1 \n\t"
02292                 "pmullw    %%mm3, %%mm0 \n\t"
02293                 "pmullw    %%mm4, %%mm1 \n\t"
02294                 "paddsw    %%mm1, %%mm0 \n\t"
02295                 "paddsw    %%mm5, %%mm0 \n\t"
02296                 "psraw     %%mm6, %%mm0 \n\t"
02297                 "packuswb  %%mm0, %%mm0 \n\t"
02298                 "movd      %%mm0, %0    \n\t"
02299                 : "+m"(*(uint32_t*)(dst+x))
02300                 :  "m"(*(uint32_t*)(src+x))
02301             );
02302         }
02303         src += stride;
02304         dst += stride;
02305     }
02306 }
02307 
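/* The bidirectional variant computes
 *     dst = clip((dst*weightd + src*weights + offset') >> (log2_denom+1))
 * with offset' = ((offset+1)|1) << log2_denom, which folds the spec's
 * 2^log2_denom rounding term and the offset handling into a single addend
 * applied before the (log2_denom+1)-bit shift (assuming the caller passes the
 * two per-reference offsets summed). */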
02308 #define H264_WEIGHT(W,H) \
02309 static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
02310     ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
02311 } \
02312 static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
02313     ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
02314 }
02315 
02316 H264_WEIGHT(16,16)
02317 H264_WEIGHT(16, 8)
02318 H264_WEIGHT( 8,16)
02319 H264_WEIGHT( 8, 8)
02320 H264_WEIGHT( 8, 4)
02321 H264_WEIGHT( 4, 8)
02322 H264_WEIGHT( 4, 4)
02323 H264_WEIGHT( 4, 2)
02324 
