• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/snowdsp_mmx.c

Go to the documentation of this file.
00001 /*
00002  * MMX and SSE2 optimized snow DSP utils
00003  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/avcodec.h"
00024 #include "libavcodec/snow.h"
00025 #include "libavcodec/dwt.h"
00026 #include "dsputil_mmx.h"
00027 
00028 static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
00029     const int w2= (width+1)>>1;
00030     DECLARE_ALIGNED(16, IDWTELEM, temp)[width>>1];
00031     const int w_l= (width>>1);
00032     const int w_r= w2 - 1;
00033     int i;
00034 
00035     { // Lift 0
00036         IDWTELEM * const ref = b + w2 - 1;
00037         IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
00038         // (the first time erroneously), we allow the SSE2 code to run an extra pass.
00039         // The savings in code and time are well worth having to store this value and
00040         // calculate b[0] correctly afterwards.
00041 
00042         i = 0;
00043         __asm__ volatile(
00044             "pcmpeqd   %%xmm7, %%xmm7         \n\t"
00045             "pcmpeqd   %%xmm3, %%xmm3         \n\t"
00046             "psllw         $1, %%xmm3         \n\t"
00047             "paddw     %%xmm7, %%xmm3         \n\t"
00048             "psllw        $13, %%xmm3         \n\t"
00049         ::);
00050         for(; i<w_l-15; i+=16){
00051             __asm__ volatile(
00052                 "movdqu   (%1), %%xmm1        \n\t"
00053                 "movdqu 16(%1), %%xmm5        \n\t"
00054                 "movdqu  2(%1), %%xmm2        \n\t"
00055                 "movdqu 18(%1), %%xmm6        \n\t"
00056                 "paddw  %%xmm1, %%xmm2        \n\t"
00057                 "paddw  %%xmm5, %%xmm6        \n\t"
00058                 "paddw  %%xmm7, %%xmm2        \n\t"
00059                 "paddw  %%xmm7, %%xmm6        \n\t"
00060                 "pmulhw %%xmm3, %%xmm2        \n\t"
00061                 "pmulhw %%xmm3, %%xmm6        \n\t"
00062                 "paddw    (%0), %%xmm2        \n\t"
00063                 "paddw  16(%0), %%xmm6        \n\t"
00064                 "movdqa %%xmm2, (%0)          \n\t"
00065                 "movdqa %%xmm6, 16(%0)        \n\t"
00066                 :: "r"(&b[i]), "r"(&ref[i])
00067                 : "memory"
00068             );
00069         }
00070         snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
00071         b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
00072     }
00073 
00074     { // Lift 1
00075         IDWTELEM * const dst = b+w2;
00076 
00077         i = 0;
00078         for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
00079             dst[i] = dst[i] - (b[i] + b[i + 1]);
00080         }
00081         for(; i<w_r-15; i+=16){
00082             __asm__ volatile(
00083                 "movdqu   (%1), %%xmm1        \n\t"
00084                 "movdqu 16(%1), %%xmm5        \n\t"
00085                 "movdqu  2(%1), %%xmm2        \n\t"
00086                 "movdqu 18(%1), %%xmm6        \n\t"
00087                 "paddw  %%xmm1, %%xmm2        \n\t"
00088                 "paddw  %%xmm5, %%xmm6        \n\t"
00089                 "movdqa   (%0), %%xmm0        \n\t"
00090                 "movdqa 16(%0), %%xmm4        \n\t"
00091                 "psubw  %%xmm2, %%xmm0        \n\t"
00092                 "psubw  %%xmm6, %%xmm4        \n\t"
00093                 "movdqa %%xmm0, (%0)          \n\t"
00094                 "movdqa %%xmm4, 16(%0)        \n\t"
00095                 :: "r"(&dst[i]), "r"(&b[i])
00096                 : "memory"
00097             );
00098         }
00099         snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
00100     }
00101 
00102     { // Lift 2
00103         IDWTELEM * const ref = b+w2 - 1;
00104         IDWTELEM b_0 = b[0];
00105 
00106         i = 0;
00107         __asm__ volatile(
00108             "psllw         $15, %%xmm7        \n\t"
00109             "pcmpeqw    %%xmm6, %%xmm6        \n\t"
00110             "psrlw         $13, %%xmm6        \n\t"
00111             "paddw      %%xmm7, %%xmm6        \n\t"
00112         ::);
00113         for(; i<w_l-15; i+=16){
00114             __asm__ volatile(
00115                 "movdqu   (%1), %%xmm0        \n\t"
00116                 "movdqu 16(%1), %%xmm4        \n\t"
00117                 "movdqu  2(%1), %%xmm1        \n\t"
00118                 "movdqu 18(%1), %%xmm5        \n\t" //FIXME try aligned reads and shifts
00119                 "paddw  %%xmm6, %%xmm0        \n\t"
00120                 "paddw  %%xmm6, %%xmm4        \n\t"
00121                 "paddw  %%xmm7, %%xmm1        \n\t"
00122                 "paddw  %%xmm7, %%xmm5        \n\t"
00123                 "pavgw  %%xmm1, %%xmm0        \n\t"
00124                 "pavgw  %%xmm5, %%xmm4        \n\t"
00125                 "psubw  %%xmm7, %%xmm0        \n\t"
00126                 "psubw  %%xmm7, %%xmm4        \n\t"
00127                 "psraw      $1, %%xmm0        \n\t"
00128                 "psraw      $1, %%xmm4        \n\t"
00129                 "movdqa   (%0), %%xmm1        \n\t"
00130                 "movdqa 16(%0), %%xmm5        \n\t"
00131                 "paddw  %%xmm1, %%xmm0        \n\t"
00132                 "paddw  %%xmm5, %%xmm4        \n\t"
00133                 "psraw      $2, %%xmm0        \n\t"
00134                 "psraw      $2, %%xmm4        \n\t"
00135                 "paddw  %%xmm1, %%xmm0        \n\t"
00136                 "paddw  %%xmm5, %%xmm4        \n\t"
00137                 "movdqa %%xmm0, (%0)          \n\t"
00138                 "movdqa %%xmm4, 16(%0)        \n\t"
00139                 :: "r"(&b[i]), "r"(&ref[i])
00140                 : "memory"
00141             );
00142         }
00143         snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
00144         b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
00145     }
00146 
00147     { // Lift 3
00148         IDWTELEM * const src = b+w2;
00149 
00150         i = 0;
00151         for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
00152             temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
00153         }
00154         for(; i<w_r-7; i+=8){
00155             __asm__ volatile(
00156                 "movdqu  2(%1), %%xmm2        \n\t"
00157                 "movdqu 18(%1), %%xmm6        \n\t"
00158                 "paddw    (%1), %%xmm2        \n\t"
00159                 "paddw  16(%1), %%xmm6        \n\t"
00160                 "movdqu   (%0), %%xmm0        \n\t"
00161                 "movdqu 16(%0), %%xmm4        \n\t"
00162                 "paddw  %%xmm2, %%xmm0        \n\t"
00163                 "paddw  %%xmm6, %%xmm4        \n\t"
00164                 "psraw      $1, %%xmm2        \n\t"
00165                 "psraw      $1, %%xmm6        \n\t"
00166                 "paddw  %%xmm0, %%xmm2        \n\t"
00167                 "paddw  %%xmm4, %%xmm6        \n\t"
00168                 "movdqa %%xmm2, (%2)          \n\t"
00169                 "movdqa %%xmm6, 16(%2)        \n\t"
00170                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
00171                  : "memory"
00172                );
00173         }
00174         snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
00175     }
00176 
00177     {
00178         snow_interleave_line_header(&i, width, b, temp);
00179 
00180         for (; (i & 0x3E) != 0x3E; i-=2){
00181             b[i+1] = temp[i>>1];
00182             b[i] = b[i>>1];
00183         }
00184         for (i-=62; i>=0; i-=64){
00185             __asm__ volatile(
00186                 "movdqa      (%1), %%xmm0       \n\t"
00187                 "movdqa    16(%1), %%xmm2       \n\t"
00188                 "movdqa    32(%1), %%xmm4       \n\t"
00189                 "movdqa    48(%1), %%xmm6       \n\t"
00190                 "movdqa      (%1), %%xmm1       \n\t"
00191                 "movdqa    16(%1), %%xmm3       \n\t"
00192                 "movdqa    32(%1), %%xmm5       \n\t"
00193                 "movdqa    48(%1), %%xmm7       \n\t"
00194                 "punpcklwd   (%2), %%xmm0       \n\t"
00195                 "punpcklwd 16(%2), %%xmm2       \n\t"
00196                 "punpcklwd 32(%2), %%xmm4       \n\t"
00197                 "punpcklwd 48(%2), %%xmm6       \n\t"
00198                 "movdqa    %%xmm0, (%0)         \n\t"
00199                 "movdqa    %%xmm2, 32(%0)       \n\t"
00200                 "movdqa    %%xmm4, 64(%0)       \n\t"
00201                 "movdqa    %%xmm6, 96(%0)       \n\t"
00202                 "punpckhwd   (%2), %%xmm1       \n\t"
00203                 "punpckhwd 16(%2), %%xmm3       \n\t"
00204                 "punpckhwd 32(%2), %%xmm5       \n\t"
00205                 "punpckhwd 48(%2), %%xmm7       \n\t"
00206                 "movdqa    %%xmm1, 16(%0)       \n\t"
00207                 "movdqa    %%xmm3, 48(%0)       \n\t"
00208                 "movdqa    %%xmm5, 80(%0)       \n\t"
00209                 "movdqa    %%xmm7, 112(%0)      \n\t"
00210                 :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
00211                  : "memory"
00212                );
00213         }
00214     }
00215 }
00216 
/**
 * MMX version of the horizontal 9/7 integer lifting pass over one line.
 *
 * The line is handled as two halves: the first w2 = (width+1)>>1 elements of
 * b, and the elements starting at b+w2. Four lifting steps ("Lift 0" to
 * "Lift 3" below) update the halves; the last one writes its result into the
 * local buffer temp. The final block interleaves the first half of b with
 * temp back into b (b[i] = b[i>>1], b[i+1] = temp[i>>1]).
 *
 * Each SIMD loop handles 8 elements per pass (two 8-byte registers of 16-bit
 * words); the lead-out helpers finish the remaining elements in C.
 * Constants are built in mm registers by one asm statement and consumed by
 * later statements; Lift 2 reuses mm7 as left by Lift 0.
 * The function contains no emms instruction.
 * NOTE(review): presumably the caller restores the FPU state - confirm.
 * NOTE(review): pavgw on mm registers is, as far as I know, not part of the
 * base MMX instruction set - verify the CPU-flag gating at the call site.
 *
 * @param b     line of coefficients, transformed in place
 * @param width number of elements in the line
 */
00217 static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
00218     const int w2= (width+1)>>1;
00219     IDWTELEM temp[width >> 1];
00220     const int w_l= (width>>1);
00221     const int w_r= w2 - 1;
00222     int i;
00223 
00224     { // Lift 0
00225         IDWTELEM * const ref = b + w2 - 1;
00226 
              /* b[0] is computed in C (using 2*ref[1]); the SIMD loop starts at element 1. */
00227         i = 1;
00228         b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
              /* mm7 = -1 in every word; mm3 = (-3)<<13 in every word (pmulhw multiplier). */
00229         __asm__ volatile(
00230             "pcmpeqw    %%mm7, %%mm7         \n\t"
00231             "pcmpeqw    %%mm3, %%mm3         \n\t"
00232             "psllw         $1, %%mm3         \n\t"
00233             "paddw      %%mm7, %%mm3         \n\t"
00234             "psllw        $13, %%mm3         \n\t"
00235            ::);
              /* b[i] += pmulhw(ref[i] + ref[i+1] - 1, mm3). */
00236         for(; i<w_l-7; i+=8){
00237             __asm__ volatile(
00238                 "movq     (%1), %%mm2        \n\t"
00239                 "movq    8(%1), %%mm6        \n\t"
00240                 "paddw   2(%1), %%mm2        \n\t"
00241                 "paddw  10(%1), %%mm6        \n\t"
00242                 "paddw   %%mm7, %%mm2        \n\t"
00243                 "paddw   %%mm7, %%mm6        \n\t"
00244                 "pmulhw  %%mm3, %%mm2        \n\t"
00245                 "pmulhw  %%mm3, %%mm6        \n\t"
00246                 "paddw    (%0), %%mm2        \n\t"
00247                 "paddw   8(%0), %%mm6        \n\t"
00248                 "movq    %%mm2, (%0)         \n\t"
00249                 "movq    %%mm6, 8(%0)        \n\t"
00250                 :: "r"(&b[i]), "r"(&ref[i])
00251                  : "memory"
00252                );
00253         }
00254         snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
00255     }
00256 
00257     { // Lift 1
00258         IDWTELEM * const dst = b+w2;
00259 
00260         i = 0;
              /* dst[i] -= b[i] + b[i+1]. */
00261         for(; i<w_r-7; i+=8){
00262             __asm__ volatile(
00263                 "movq     (%1), %%mm2        \n\t"
00264                 "movq    8(%1), %%mm6        \n\t"
00265                 "paddw   2(%1), %%mm2        \n\t"
00266                 "paddw  10(%1), %%mm6        \n\t"
00267                 "movq     (%0), %%mm0        \n\t"
00268                 "movq    8(%0), %%mm4        \n\t"
00269                 "psubw   %%mm2, %%mm0        \n\t"
00270                 "psubw   %%mm6, %%mm4        \n\t"
00271                 "movq    %%mm0, (%0)         \n\t"
00272                 "movq    %%mm4, 8(%0)        \n\t"
00273                 :: "r"(&dst[i]), "r"(&b[i])
00274                  : "memory"
00275                );
00276         }
00277         snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
00278     }
00279 
00280     { // Lift 2
00281         IDWTELEM * const ref = b+w2 - 1;
00282 
              /* b[0] is computed in C (using 2*ref[1]); the SIMD loop starts at element 1. */
00283         i = 1;
00284         b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
              /* Relies on mm7 still being all ones from Lift 0:
               * mm7 = 0x8000 per word (sign bias for the unsigned pavgw),
               * mm6 = 0x8000 + 7 per word (bias plus rounding term). */
00285         __asm__ volatile(
00286             "psllw         $15, %%mm7        \n\t"
00287             "pcmpeqw     %%mm6, %%mm6        \n\t"
00288             "psrlw         $13, %%mm6        \n\t"
00289             "paddw       %%mm7, %%mm6        \n\t"
00290            ::);
00291         for(; i<w_l-7; i+=8){
00292             __asm__ volatile(
00293                 "movq     (%1), %%mm0        \n\t"
00294                 "movq    8(%1), %%mm4        \n\t"
00295                 "movq    2(%1), %%mm1        \n\t"
00296                 "movq   10(%1), %%mm5        \n\t"
00297                 "paddw   %%mm6, %%mm0        \n\t"
00298                 "paddw   %%mm6, %%mm4        \n\t"
00299                 "paddw   %%mm7, %%mm1        \n\t"
00300                 "paddw   %%mm7, %%mm5        \n\t"
00301                 "pavgw   %%mm1, %%mm0        \n\t"
00302                 "pavgw   %%mm5, %%mm4        \n\t"
00303                 "psubw   %%mm7, %%mm0        \n\t"
00304                 "psubw   %%mm7, %%mm4        \n\t"
00305                 "psraw      $1, %%mm0        \n\t"
00306                 "psraw      $1, %%mm4        \n\t"
00307                 "movq     (%0), %%mm1        \n\t"
00308                 "movq    8(%0), %%mm5        \n\t"
00309                 "paddw   %%mm1, %%mm0        \n\t"
00310                 "paddw   %%mm5, %%mm4        \n\t"
00311                 "psraw      $2, %%mm0        \n\t"
00312                 "psraw      $2, %%mm4        \n\t"
00313                 "paddw   %%mm1, %%mm0        \n\t"
00314                 "paddw   %%mm5, %%mm4        \n\t"
00315                 "movq    %%mm0, (%0)         \n\t"
00316                 "movq    %%mm4, 8(%0)        \n\t"
00317                 :: "r"(&b[i]), "r"(&ref[i])
00318                  : "memory"
00319                );
00320         }
00321         snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
00322     }
00323 
00324     { // Lift 3
00325         IDWTELEM * const src = b+w2;
00326         i = 0;
00327 
              /* Per element: s = b[i] + b[i+1]; temp[i] = src[i] + s + (s>>1). */
00328         for(; i<w_r-7; i+=8){
00329             __asm__ volatile(
00330                 "movq    2(%1), %%mm2        \n\t"
00331                 "movq   10(%1), %%mm6        \n\t"
00332                 "paddw    (%1), %%mm2        \n\t"
00333                 "paddw   8(%1), %%mm6        \n\t"
00334                 "movq     (%0), %%mm0        \n\t"
00335                 "movq    8(%0), %%mm4        \n\t"
00336                 "paddw   %%mm2, %%mm0        \n\t"
00337                 "paddw   %%mm6, %%mm4        \n\t"
00338                 "psraw      $1, %%mm2        \n\t"
00339                 "psraw      $1, %%mm6        \n\t"
00340                 "paddw   %%mm0, %%mm2        \n\t"
00341                 "paddw   %%mm4, %%mm6        \n\t"
00342                 "movq    %%mm2, (%2)         \n\t"
00343                 "movq    %%mm6, 8(%2)        \n\t"
00344                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
00345                  : "memory"
00346                );
00347         }
00348         snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
00349     }
00350 
          /* Interleave: walk i downwards, writing pairs b[i] = b[i>>1], b[i+1] = temp[i>>1]. */
00351     {
00352         snow_interleave_line_header(&i, width, b, temp);
00353 
              /* Scalar pairs until i is 30 modulo 32. */
00354         for (; (i & 0x1E) != 0x1E; i-=2){
00355             b[i+1] = temp[i>>1];
00356             b[i] = b[i>>1];
00357         }
              /* 32 output elements per pass: 16 from b[i>>1..] interleaved with 16 from temp[i>>1..]. */
00358         for (i-=30; i>=0; i-=32){
00359             __asm__ volatile(
00360                 "movq        (%1), %%mm0       \n\t"
00361                 "movq       8(%1), %%mm2       \n\t"
00362                 "movq      16(%1), %%mm4       \n\t"
00363                 "movq      24(%1), %%mm6       \n\t"
00364                 "movq        (%1), %%mm1       \n\t"
00365                 "movq       8(%1), %%mm3       \n\t"
00366                 "movq      16(%1), %%mm5       \n\t"
00367                 "movq      24(%1), %%mm7       \n\t"
00368                 "punpcklwd   (%2), %%mm0       \n\t"
00369                 "punpcklwd  8(%2), %%mm2       \n\t"
00370                 "punpcklwd 16(%2), %%mm4       \n\t"
00371                 "punpcklwd 24(%2), %%mm6       \n\t"
00372                 "movq       %%mm0, (%0)        \n\t"
00373                 "movq       %%mm2, 16(%0)      \n\t"
00374                 "movq       %%mm4, 32(%0)      \n\t"
00375                 "movq       %%mm6, 48(%0)      \n\t"
00376                 "punpckhwd   (%2), %%mm1       \n\t"
00377                 "punpckhwd  8(%2), %%mm3       \n\t"
00378                 "punpckhwd 16(%2), %%mm5       \n\t"
00379                 "punpckhwd 24(%2), %%mm7       \n\t"
00380                 "movq       %%mm1, 8(%0)       \n\t"
00381                 "movq       %%mm3, 24(%0)      \n\t"
00382                 "movq       %%mm5, 40(%0)      \n\t"
00383                 "movq       %%mm7, 56(%0)      \n\t"
00384                 :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
00385                  : "memory"
00386                );
00387         }
00388     }
00389 }
00390 
00391 #if HAVE_7REGS
/* Asm-text builders for the vertical compose loops. Each macro expands to
 * four instructions, one per register of a four-register group. In the
 * memory forms, r / w is an operand string such as "%4" used as base register
 * and REG_d is the index register (byte offset into the line).
 *
 * load_add: apply `op` to memory at r+REG_d+{0,16,32,48} with destinations t0..t3. */
00392 #define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
00393         ""op" ("r",%%"REG_d"), %%"t0"      \n\t"\
00394         ""op" 16("r",%%"REG_d"), %%"t1"    \n\t"\
00395         ""op" 32("r",%%"REG_d"), %%"t2"    \n\t"\
00396         ""op" 48("r",%%"REG_d"), %%"t3"    \n\t"
00397 
/* load: movdqa four 16-byte chunks from memory into t0..t3. */
00398 #define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
00399         snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
00400 
/* add: paddw four 16-byte chunks from memory onto t0..t3. */
00401 #define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
00402         snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
00403 
/* r2r_sub: register to register, t_k -= s_k (word-wise). Register names are
 * passed without a size-specific form, so this also serves the MMX code. */
00404 #define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
00405         "psubw %%"s0", %%"t0" \n\t"\
00406         "psubw %%"s1", %%"t1" \n\t"\
00407         "psubw %%"s2", %%"t2" \n\t"\
00408         "psubw %%"s3", %%"t3" \n\t"
00409 
/* store: movdqa s0..s3 to memory at w+REG_d+{0,16,32,48}. */
00410 #define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
00411         "movdqa %%"s0", ("w",%%"REG_d")      \n\t"\
00412         "movdqa %%"s1", 16("w",%%"REG_d")    \n\t"\
00413         "movdqa %%"s2", 32("w",%%"REG_d")    \n\t"\
00414         "movdqa %%"s3", 48("w",%%"REG_d")    \n\t"
00415 
/* sra: arithmetic right shift of every word in t0..t3 by the immediate n (a string). */
00416 #define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
00417         "psraw $"n", %%"t0" \n\t"\
00418         "psraw $"n", %%"t1" \n\t"\
00419         "psraw $"n", %%"t2" \n\t"\
00420         "psraw $"n", %%"t3" \n\t"
00421 
/* r2r_add: register to register, t_k += s_k (word-wise). */
00422 #define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
00423         "paddw %%"s0", %%"t0" \n\t"\
00424         "paddw %%"s1", %%"t1" \n\t"\
00425         "paddw %%"s2", %%"t2" \n\t"\
00426         "paddw %%"s3", %%"t3" \n\t"
00427 
/* r2r_pmulhw: t_k = high 16 bits of the signed product t_k * s_k (word-wise). */
00428 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
00429         "pmulhw %%"s0", %%"t0" \n\t"\
00430         "pmulhw %%"s1", %%"t1" \n\t"\
00431         "pmulhw %%"s2", %%"t2" \n\t"\
00432         "pmulhw %%"s3", %%"t3" \n\t"
00433 
/* move: copy xmm registers s0..s3 into t0..t3. */
00434 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
00435         "movdqa %%"s0", %%"t0" \n\t"\
00436         "movdqa %%"s1", %%"t1" \n\t"\
00437         "movdqa %%"s2", %%"t2" \n\t"\
00438         "movdqa %%"s3", %%"t3" \n\t"
00439 
/**
 * SSE2 vertical 9/7 integer lifting step across six lines.
 *
 * For every column i the lines are updated in this order, each step using the
 * results of the previous one (same formulas as the scalar loop below):
 *   b4 -= (W_DM*(b3+b5)+W_DO)>>W_DS
 *   b3 -= (W_CM*(b2+b4)+W_CO)>>W_CS
 *   b2 += (W_BM*(b1+b3)+4*b2+W_BO)>>W_BS
 *   b1 += (W_AM*(b0+b2)+W_AO)>>W_AS
 * b0 and b5 are only read.
 *
 * The scalar loop handles columns from width-1 down to the nearest multiple
 * of 32; the asm loop then handles the rest 32 columns (64 bytes) at a time,
 * from the end toward the start. movdqa is used on every line, so b0..b5
 * have to be 16-byte aligned.
 *
 * Asm operands: %0 = byte offset in REG_d, %1..%6 = b0..b5.
 *
 * @param b0..b5 six consecutive lines; b1..b4 are modified in place
 * @param width  number of columns
 */
00440 static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
00441     x86_reg i = width;
00442 
00443     while(i & 0x1F)
00444     {
00445         i--;
00446         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
00447         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
00448         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
00449         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
00450     }
          /* Convert the element count into a byte offset (2 bytes per element). */
00451     i+=i;
00452 
00453          __asm__ volatile (
00454         "jmp 2f                                      \n\t"
00455         "1:                                          \n\t"
              /* b3+b5 goes into xmm1/3/5/7: xmm0 and xmm2 are overwritten by the
               * constants built just below, and the odd registers are what the
               * following add/pmulhw consume (same layout as the MMX version). */
00456         snow_vertical_compose_sse2_load("%4","xmm1","xmm3","xmm5","xmm7")
00457         snow_vertical_compose_sse2_add("%6","xmm1","xmm3","xmm5","xmm7")
00458 
00459 
              /* xmm0 = -1 per word, xmm2 = (-3)<<13 per word (pmulhw multiplier). */
00460         "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
00461         "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
00462         "paddw      %%xmm2, %%xmm2                   \n\t"
00463         "paddw      %%xmm0, %%xmm2                   \n\t"
00464         "psllw         $13, %%xmm2                   \n\t"
00465         snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
00466         snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
              /* b4 updated and stored; then b3 -= b2 + new b4. */
00467         snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
00468         snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
00469         snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
00470         snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
00471         snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
00472         snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
00473 
              /* xmm7 = 0x8000 per word (sign bias for the unsigned pavgw),
               * xmm5 = 0x8000 + 7 per word (bias plus rounding term). */
00474         "pcmpeqw %%xmm7, %%xmm7                      \n\t"
00475         "pcmpeqw %%xmm5, %%xmm5                      \n\t"
00476         "psllw $15, %%xmm7                           \n\t"
00477         "psrlw $13, %%xmm5                           \n\t"
00478         "paddw %%xmm7, %%xmm5                        \n\t"
00479         snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
              /* Load b1 in full 16-byte chunks at 0/16/32/48 so each chunk lines up
               * with the b3 chunk held in xmm0/2/4/6. */
00480         "movdqa   (%2,%%"REG_d"), %%xmm1      \n\t"
00481         "movdqa 16(%2,%%"REG_d"), %%xmm3      \n\t"
00482         "paddw %%xmm7, %%xmm1                        \n\t"
00483         "paddw %%xmm7, %%xmm3                        \n\t"
00484         "pavgw %%xmm1, %%xmm0                        \n\t"
00485         "pavgw %%xmm3, %%xmm2                        \n\t"
00486         "movdqa 32(%2,%%"REG_d"), %%xmm1      \n\t"
00487         "movdqa 48(%2,%%"REG_d"), %%xmm3      \n\t"
00488         "paddw %%xmm7, %%xmm1                        \n\t"
00489         "paddw %%xmm7, %%xmm3                        \n\t"
00490         "pavgw %%xmm1, %%xmm4                        \n\t"
00491         "pavgw %%xmm3, %%xmm6                        \n\t"
00492         snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
00493         snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
00494         snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
00495 
00496         snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
00497         snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
00498         snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
              /* s = new b2 + b0; b1 += s + (s>>1). */
00499         snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
00500         snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
00501         snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
00502         snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
00503         snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
00504         snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
00505 
              /* Step back one 64-byte group; loop while the offset stays >= 0. */
00506         "2:                                          \n\t"
00507         "sub $64, %%"REG_d"                          \n\t"
00508         "jge 1b                                      \n\t"
00509         :"+d"(i)
              /* "memory": the asm reads and writes the lines through b0..b5. */
00510         :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5) : "memory");
00511 }
00512 
/* MMX counterparts of the vertical compose asm-text builders. Each expands to
 * four instructions on 8-byte chunks; r / w is an operand string such as "%4"
 * used as base register and REG_d is the index register (byte offset).
 *
 * load_add: apply `op` to memory at r+REG_d+{0,8,16,24} with destinations t0..t3. */
00513 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
00514         ""op" ("r",%%"REG_d"), %%"t0"   \n\t"\
00515         ""op" 8("r",%%"REG_d"), %%"t1"  \n\t"\
00516         ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
00517         ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
00518 
/* load: movq four 8-byte chunks from memory into t0..t3. */
00519 #define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
00520         snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
00521 
/* add: paddw four 8-byte chunks from memory onto t0..t3. */
00522 #define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
00523         snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
00524 
/* store: movq s0..s3 to memory at w+REG_d+{0,8,16,24}. */
00525 #define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
00526         "movq %%"s0", ("w",%%"REG_d")   \n\t"\
00527         "movq %%"s1", 8("w",%%"REG_d")  \n\t"\
00528         "movq %%"s2", 16("w",%%"REG_d") \n\t"\
00529         "movq %%"s3", 24("w",%%"REG_d") \n\t"
00530 
/* move: copy mm registers s0..s3 into t0..t3. */
00531 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
00532         "movq %%"s0", %%"t0" \n\t"\
00533         "movq %%"s1", %%"t1" \n\t"\
00534         "movq %%"s2", %%"t2" \n\t"\
00535         "movq %%"s3", %%"t3" \n\t"
00536 
00537 
/**
 * MMX vertical 9/7 integer lifting step across six lines.
 *
 * For every column i the lines are updated in this order, each step using the
 * results of the previous one (same formulas as the scalar loop below):
 *   b4 -= (W_DM*(b3+b5)+W_DO)>>W_DS
 *   b3 -= (W_CM*(b2+b4)+W_CO)>>W_CS
 *   b2 += (W_BM*(b1+b3)+4*b2+W_BO)>>W_BS
 *   b1 += (W_AM*(b0+b2)+W_AO)>>W_AS
 * b0 and b5 are only read.
 *
 * The scalar loop handles columns from width-1 down to the nearest multiple
 * of 16; the asm loop then handles the rest 16 columns (32 bytes) at a time,
 * from the end toward the start.
 *
 * Asm operands: %0 = byte offset in REG_d, %1..%6 = b0..b5.
 * The function contains no emms instruction.
 * NOTE(review): presumably the caller restores the FPU state - confirm.
 *
 * @param b0..b5 six consecutive lines; b1..b4 are modified in place
 * @param width  number of columns
 */
00538 static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
00539     x86_reg i = width;
00540     while(i & 15)
00541     {
00542         i--;
00543         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
00544         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
00545         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
00546         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
00547     }
          /* Convert the element count into a byte offset (2 bytes per element). */
00548     i+=i;
00549     __asm__ volatile(
00550         "jmp 2f                                      \n\t"
00551         "1:                                          \n\t"
00552 
              /* mm1/3/5/7 = b3 + b5. */
00553         snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
00554         snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
              /* mm0 = -1 per word, mm2 = (-3)<<13 per word (pmulhw multiplier). */
00555         "pcmpeqw    %%mm0, %%mm0                     \n\t"
00556         "pcmpeqw    %%mm2, %%mm2                     \n\t"
00557         "paddw      %%mm2, %%mm2                     \n\t"
00558         "paddw      %%mm0, %%mm2                     \n\t"
00559         "psllw        $13, %%mm2                     \n\t"
00560         snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
00561         snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
              /* b4 updated and stored; then b3 -= b2 + new b4. */
00562         snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
00563         snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
00564         snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
00565         snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
00566         snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
00567         snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
              /* mm7 = 0x8000 per word (sign bias for the unsigned pavgw),
               * mm5 = 0x8000 + 7 per word (bias plus rounding term). */
00568         "pcmpeqw %%mm7, %%mm7                        \n\t"
00569         "pcmpeqw %%mm5, %%mm5                        \n\t"
00570         "psllw $15, %%mm7                            \n\t"
00571         "psrlw $13, %%mm5                            \n\t"
00572         "paddw %%mm7, %%mm5                          \n\t"
00573         snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
00574         "movq   (%2,%%"REG_d"), %%mm1         \n\t"
00575         "movq  8(%2,%%"REG_d"), %%mm3         \n\t"
00576         "paddw %%mm7, %%mm1                          \n\t"
00577         "paddw %%mm7, %%mm3                          \n\t"
00578         "pavgw %%mm1, %%mm0                          \n\t"
00579         "pavgw %%mm3, %%mm2                          \n\t"
00580         "movq 16(%2,%%"REG_d"), %%mm1         \n\t"
00581         "movq 24(%2,%%"REG_d"), %%mm3         \n\t"
00582         "paddw %%mm7, %%mm1                          \n\t"
00583         "paddw %%mm7, %%mm3                          \n\t"
00584         "pavgw %%mm1, %%mm4                          \n\t"
00585         "pavgw %%mm3, %%mm6                          \n\t"
00586         snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
00587         snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
00588         snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
00589 
00590         snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
00591         snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
00592         snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
              /* s = new b2 + b0; b1 += s + (s>>1). */
00593         snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
00594         snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
00595         snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
00596         snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
00597         snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
00598         snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
00599 
              /* Step back one 32-byte group; loop while the offset stays >= 0. */
00600         "2:                                          \n\t"
00601         "sub $32, %%"REG_d"                          \n\t"
00602         "jge 1b                                      \n\t"
00603         :"+d"(i)
              /* "memory": the asm reads and writes the lines through b0..b5, so the
               * compiler must not cache or reorder accesses to them across it. */
00604         :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5) : "memory");
00605 }
00606 #endif //HAVE_7REGS
00607 
00608 #define snow_inner_add_yblock_sse2_header \
00609     IDWTELEM * * dst_array = sb->line + src_y;\
00610     x86_reg tmp;\
00611     __asm__ volatile(\
00612              "mov  %7, %%"REG_c"             \n\t"\
00613              "mov  %6, %2                    \n\t"\
00614              "mov  %4, %%"REG_S"             \n\t"\
00615              "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
00616              "pcmpeqd %%xmm3, %%xmm3         \n\t"\
00617              "psllw $15, %%xmm3              \n\t"\
00618              "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
00619              "1:                             \n\t"\
00620              "mov %1, %%"REG_D"              \n\t"\
00621              "mov (%%"REG_D"), %%"REG_D"     \n\t"\
00622              "add %3, %%"REG_D"              \n\t"
00623 
00624 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
00625              "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
00626              "movq (%%"REG_d"), %%"out_reg1" \n\t"\
00627              "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
00628              "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
00629              "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
00630              "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
00631              "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
00632              "punpcklbw %%xmm7, %%xmm0       \n\t"\
00633              "punpcklbw %%xmm7, %%xmm4       \n\t"\
00634              "pmullw %%xmm0, %%"out_reg1"    \n\t"\
00635              "pmullw %%xmm4, %%"out_reg2"    \n\t"
00636 
/*
 * Load and weight 16 pixels of a single row of one block, as two 8-pixel halves.
 *   out_reg1, out_reg2: xmm register names receiving the products of pixels 0-7 / 8-15
 *   ptr_offset:         index into the block pointer array held in REG_a
 *   s_offset:           byte offset into the weight table held in REG_S
 * Bytes are zero-extended to 16-bit lanes (xmm7 is zero) before pmullw.
 * The second half is read at +8 bytes for both pixels and weights.
 * Uses REG_d, xmm0 and xmm4 as scratch.
 */
00637 #define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
00638              "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
00639              "movq (%%"REG_d"), %%"out_reg1" \n\t"\
00640              "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
00641              "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
00642              "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
00643              "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
00644              "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
00645              "punpcklbw %%xmm7, %%xmm0       \n\t"\
00646              "punpcklbw %%xmm7, %%xmm4       \n\t"\
00647              "pmullw %%xmm0, %%"out_reg1"    \n\t"\
00648              "pmullw %%xmm4, %%"out_reg2"    \n\t"
00649 
/*
 * Weight one more block (two rows of 8 pixels) into xmm2/xmm6 and add the
 * products to the running sums in xmm1/xmm5 with unsigned saturation.
 */
00650 #define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
00651              snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
00652              "paddusw %%xmm2, %%xmm1         \n\t"\
00653              "paddusw %%xmm6, %%xmm5         \n\t"
00654 
/*
 * Weight one more block (one row of 16 pixels) into xmm2/xmm6 and add the
 * products to the running sums in xmm1/xmm5 with unsigned saturation.
 */
00655 #define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
00656              snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
00657              "paddusw %%xmm2, %%xmm1         \n\t"\
00658              "paddusw %%xmm6, %%xmm5         \n\t"
00659 
/*
 * Per-iteration pointer advance shared by the SSE2 kernels: moves the weight
 * pointer on by 32 bytes and adds REG_c to dst8 (%0) and to each of the four
 * pointers stored in the block array addressed by REG_a.
 * NOTE(review): the block array is modified in memory, although "block" is
 * passed as an input operand and no "memory" clobber is declared - confirm
 * callers do not rely on the array being unchanged.
 */
00660 #define snow_inner_add_yblock_sse2_end_common1\
00661              "add $32, %%"REG_S"             \n\t"\
00662              "add %%"REG_c", %0              \n\t"\
00663              "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
00664              "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
00665              "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
00666              "add %%"REG_c", (%%"REG_a")     \n\t"
00667 
/*
 * Loop back-edge plus the operand lists that CLOSE the __asm__ statement
 * opened by snow_inner_add_yblock_sse2_header.  The jnz tests the flags left
 * by the counter update emitted immediately before this macro.
 * Operand map:
 *   %0 dst8 (in/out, memory)       %1 dst_array (in/out, memory)
 *   %2 tmp (early-clobber register, loop counter)
 *   %3 src_x<<1    %4 obmc    %5 block (in REG_a)    %6 b_h    %7 src_stride
 * Clobbers REG_c, REG_S, REG_D and REG_d.
 */
00668 #define snow_inner_add_yblock_sse2_end_common2\
00669              "jnz 1b                         \n\t"\
00670              :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
00671              :\
00672              "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
00673              "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
00674 
/*
 * Loop tail for the 8-wide SSE2 kernel, which handles two rows per iteration:
 * REG_c is doubled around the shared pointer advance so that dst8 and the
 * block pointers move by two strides, dst_array moves by two entries, and the
 * counter drops by 2.  The flags of the final "sub" drive the jnz.
 */
00675 #define snow_inner_add_yblock_sse2_end_8\
00676              "sal $1, %%"REG_c"              \n\t"\
00677              "add $"PTR_SIZE"*2, %1          \n\t"\
00678              snow_inner_add_yblock_sse2_end_common1\
00679              "sar $1, %%"REG_c"              \n\t"\
00680              "sub $2, %2                     \n\t"\
00681              snow_inner_add_yblock_sse2_end_common2
00682 
/*
 * Loop tail for the 16-wide SSE2 kernel, which handles one row per iteration:
 * dst_array moves by one entry, the shared pointer advance runs with REG_c
 * unchanged, and the counter drops by 1.  The flags of "dec" drive the jnz.
 */
00683 #define snow_inner_add_yblock_sse2_end_16\
00684              "add $"PTR_SIZE"*1, %1          \n\t"\
00685              snow_inner_add_yblock_sse2_end_common1\
00686              "dec %2                         \n\t"\
00687              snow_inner_add_yblock_sse2_end_common2
00688 
/*
 * SSE2 kernel for 8-pixel-wide blocks, processing two rows per loop iteration.
 * For each pair of rows it sums the weighted pixels of block[3..0] (weights
 * read from obmc at byte offsets 0, 8, 128, 136 and the same +16 for the
 * second row), adds the values read from sb->line[src_y + row] at byte offset
 * src_x<<1, rounds, shifts, packs to unsigned bytes and stores 8 bytes per
 * row to dst8.
 * The asm does not reference obmc_stride, b_w or add.  The loop counter is
 * decremented by 2 and exits only on reaching zero, hence the "bh_even" name.
 * NOTE(review): the line data is added in 32-bit lanes (paddd on 2x16 bytes
 * per row) and shifted by 8, while the rounding constant in xmm3 is built per
 * 16-bit lane and the 16-wide sibling kernel works on 16-bit lanes with a
 * shift of 4 - confirm this matches the current width of IDWTELEM.
 */
00689 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
00690                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00691 snow_inner_add_yblock_sse2_header
00692 snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
00693 snow_inner_add_yblock_sse2_accum_8("2", "8")
00694 snow_inner_add_yblock_sse2_accum_8("1", "128")
00695 snow_inner_add_yblock_sse2_accum_8("0", "136")
00696 
             /* REG_d = dst8; first row: widen the 8 word sums in xmm1 to dwords
              * and add them to 32 bytes loaded from the current line (REG_D). */
00697              "mov %0, %%"REG_d"              \n\t"
00698              "movdqa (%%"REG_D"), %%xmm0     \n\t"
00699              "movdqa %%xmm1, %%xmm2          \n\t"
00700 
00701              "punpckhwd %%xmm7, %%xmm1       \n\t"
00702              "punpcklwd %%xmm7, %%xmm2       \n\t"
00703              "paddd %%xmm2, %%xmm0           \n\t"
00704              "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
00705              "paddd %%xmm1, %%xmm2           \n\t"
00706              "paddd %%xmm3, %%xmm0           \n\t"
00707              "paddd %%xmm3, %%xmm2           \n\t"
00708 
             /* REG_D = dst_array[1] + (src_x<<1): the line for the second row. */
00709              "mov %1, %%"REG_D"              \n\t"
00710              "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
00711              "add %3, %%"REG_D"              \n\t"
00712 
             /* Second row: same widening and addition using the sums in xmm5. */
00713              "movdqa (%%"REG_D"), %%xmm4     \n\t"
00714              "movdqa %%xmm5, %%xmm6          \n\t"
00715              "punpckhwd %%xmm7, %%xmm5       \n\t"
00716              "punpcklwd %%xmm7, %%xmm6       \n\t"
00717              "paddd %%xmm6, %%xmm4           \n\t"
00718              "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
00719              "paddd %%xmm5, %%xmm6           \n\t"
00720              "paddd %%xmm3, %%xmm4           \n\t"
00721              "paddd %%xmm3, %%xmm6           \n\t"
00722 
             /* Shift, narrow dwords -> words -> unsigned bytes, store row 0 at dst8. */
00723              "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
00724              "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
00725              "packssdw %%xmm2, %%xmm0        \n\t"
00726              "packuswb %%xmm7, %%xmm0        \n\t"
00727              "movq %%xmm0, (%%"REG_d")       \n\t"
00728 
             /* Same for row 1, stored at dst8 + REG_c (src_stride). */
00729              "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
00730              "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
00731              "packssdw %%xmm6, %%xmm4        \n\t"
00732              "packuswb %%xmm7, %%xmm4        \n\t"
00733              "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
00734 snow_inner_add_yblock_sse2_end_8
00735 }
00736 
/*
 * SSE2 kernel for 16-pixel-wide blocks, processing one row per loop iteration.
 * For each row it sums the weighted pixels of block[3..0] (weights read from
 * obmc at byte offsets 0, 16, 512, 528, plus +8 for the upper 8 pixels),
 * shifts the 16-bit sums right by 4, adds 32 bytes of 16-bit values read from
 * sb->line[src_y + row] at byte offset src_x<<1, adds the rounding constant
 * from xmm3, shifts right arithmetically by 4, packs to unsigned bytes and
 * stores 16 bytes to dst8 with an unaligned store.
 * The asm does not reference obmc_stride, b_w or add.
 */
00737 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
00738                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00739 snow_inner_add_yblock_sse2_header
00740 snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
00741 snow_inner_add_yblock_sse2_accum_16("2", "16")
00742 snow_inner_add_yblock_sse2_accum_16("1", "512")
00743 snow_inner_add_yblock_sse2_accum_16("0", "528")
00744 
             /* REG_d = dst8; xmm1/xmm5 hold the weighted sums of pixels 0-7 / 8-15. */
00745              "mov %0, %%"REG_d"              \n\t"
00746              "psrlw $4, %%xmm1               \n\t"
00747              "psrlw $4, %%xmm5               \n\t"
00748              "paddw   (%%"REG_D"), %%xmm1    \n\t"
00749              "paddw 16(%%"REG_D"), %%xmm5    \n\t"
00750              "paddw %%xmm3, %%xmm1           \n\t"
00751              "paddw %%xmm3, %%xmm5           \n\t"
00752              "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
00753              "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
00754              "packuswb %%xmm5, %%xmm1        \n\t"
00755 
00756              "movdqu %%xmm1, (%%"REG_d")       \n\t"
00757 
00758 snow_inner_add_yblock_sse2_end_16
00759 }
00760 
/*
 * Common prologue of the MMX inner_add_yblock kernels.
 * Declares the locals used as asm operands and OPENS the __asm__ statement;
 * the statement is only closed by snow_inner_add_yblock_mmx_end, so this
 * macro cannot be used on its own.  Operand numbers refer to the list in that
 * macro: %1 = dst_array, %2 = tmp, %3 = src_x<<1, %4 = obmc, %6 = b_h,
 * %7 = src_stride.  Label "1:" is the top of the per-row loop.
 */
00761 #define snow_inner_add_yblock_mmx_header \
00762     IDWTELEM * * dst_array = sb->line + src_y;\
00763     x86_reg tmp;\
00764     __asm__ volatile(\
00765              "mov  %7, %%"REG_c"             \n\t" /* REG_c = src_stride */\
00766              "mov  %6, %2                    \n\t" /* tmp = b_h, the loop counter */\
00767              "mov  %4, %%"REG_S"             \n\t" /* REG_S = obmc */\
00768              "pxor %%mm7, %%mm7              \n\t" /* 0 */\
00769              "pcmpeqd %%mm3, %%mm3           \n\t" /* all bits set */\
00770              "psllw $15, %%mm3               \n\t" /* each 16-bit lane = 0x8000 */\
00771              "psrlw $12, %%mm3               \n\t" /* each 16-bit lane = 8, used as rounding term (orig. note: FRAC_BITS >> 1) */\
00772              "1:                             \n\t"\
00773              "mov %1, %%"REG_D"              \n\t" /* REG_D = dst_array */\
00774              "mov (%%"REG_D"), %%"REG_D"     \n\t" /* REG_D = *dst_array, the current line pointer */\
00775              "add %3, %%"REG_D"              \n\t" /* advance by src_x<<1 bytes */
00776 
/*
 * Load and weight 8 pixels of a single row of one block, as two 4-pixel halves.
 *   out_reg1, out_reg2: mm register names receiving the products of pixels 0-3 / 4-7
 *   ptr_offset:         index into the block pointer array held in REG_a
 *   s_offset:           byte offset into the weight table held in REG_S
 *   d_offset:           byte offset into the block row
 * Bytes are zero-extended to 16-bit lanes (mm7 is zero) before pmullw.
 * The second half is read at +4 bytes for both pixels and weights.
 * Uses REG_d, mm0 and mm4 as scratch.
 */
00777 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
00778              "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
00779              "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
00780              "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
00781              "punpcklbw %%mm7, %%"out_reg1" \n\t"\
00782              "punpcklbw %%mm7, %%"out_reg2" \n\t"\
00783              "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
00784              "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
00785              "punpcklbw %%mm7, %%mm0       \n\t"\
00786              "punpcklbw %%mm7, %%mm4       \n\t"\
00787              "pmullw %%mm0, %%"out_reg1"    \n\t"\
00788              "pmullw %%mm4, %%"out_reg2"    \n\t"
00789 
/*
 * Weight 8 pixels of one more block into mm2/mm6 and add the products to the
 * running sums in mm1/mm5 with unsigned saturation.
 */
00790 #define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
00791              snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
00792              "paddusw %%mm2, %%mm1         \n\t"\
00793              "paddusw %%mm6, %%mm5         \n\t"
00794 
/*
 * Combine the weighted sums in mm1/mm5 with the current line and store 8 bytes.
 *   read_offset:  byte offset into the line addressed by REG_D
 *   write_offset: byte offset into dst8
 * Sums are shifted right by 4, the line's 16-bit values and the rounding
 * constant in mm3 are added, the result is shifted right arithmetically by 4
 * and packed to unsigned bytes with saturation.  REG_d is reloaded with dst8.
 */
00795 #define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
00796              "mov %0, %%"REG_d"              \n\t"\
00797              "psrlw $4, %%mm1                \n\t"\
00798              "psrlw $4, %%mm5                \n\t"\
00799              "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
00800              "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
00801              "paddw %%mm3, %%mm1             \n\t"\
00802              "paddw %%mm3, %%mm5             \n\t"\
00803              "psraw $4, %%mm1                \n\t"\
00804              "psraw $4, %%mm5                \n\t"\
00805              "packuswb %%mm5, %%mm1          \n\t"\
00806              "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
00807 
/*
 * Loop tail of the MMX kernels plus the operand lists that CLOSE the
 * __asm__ statement opened by snow_inner_add_yblock_mmx_header.
 *   s_step: number of bytes to advance the weight pointer (REG_S) per row
 * Adds REG_c to the four pointers stored in the block array (in memory) and
 * to dst8, moves dst_array on by one entry, decrements the counter and loops
 * while it is non-zero.
 * Operand map:
 *   %0 dst8 (in/out, memory)       %1 dst_array (in/out, memory)
 *   %2 tmp (early-clobber register, loop counter)
 *   %3 src_x<<1    %4 obmc    %5 block (in REG_a)    %6 b_h    %7 src_stride
 * Clobbers REG_c, REG_S, REG_D and REG_d.
 * NOTE(review): the block array is modified in memory, although "block" is
 * passed as an input operand and no "memory" clobber is declared - confirm
 * callers do not rely on the array being unchanged.
 */
00808 #define snow_inner_add_yblock_mmx_end(s_step)\
00809              "add $"s_step", %%"REG_S"             \n\t"\
00810              "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
00811              "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
00812              "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
00813              "add %%"REG_c", (%%"REG_a")     \n\t"\
00814              "add $"PTR_SIZE"*1, %1          \n\t"\
00815              "add %%"REG_c", %0              \n\t"\
00816              "dec %2                         \n\t"\
00817              "jnz 1b                         \n\t"\
00818              :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
00819              :\
00820              "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
00821              "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
00822 
/*
 * MMX kernel for 8-pixel-wide blocks, processing one row per loop iteration.
 * Each row sums the weighted pixels of block[3..0] (weights at obmc byte
 * offsets 0, 8, 128, 136), mixes the sums with the line values and stores
 * 8 bytes to dst8; the weight pointer then advances by 16 bytes.
 * The asm does not reference obmc_stride, b_w or add.
 */
00823 static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
00824                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00825 snow_inner_add_yblock_mmx_header
00826 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
00827 snow_inner_add_yblock_mmx_accum("2", "8", "0")
00828 snow_inner_add_yblock_mmx_accum("1", "128", "0")
00829 snow_inner_add_yblock_mmx_accum("0", "136", "0")
00830 snow_inner_add_yblock_mmx_mix("0", "0")
00831 snow_inner_add_yblock_mmx_end("16")
00832 }
00833 
/*
 * MMX kernel for 16-pixel-wide blocks, processing one row per loop iteration
 * as two independent 8-pixel halves.  The first half uses weights at obmc
 * byte offsets 0, 16, 512, 528 and pixel offset 0; the second half uses the
 * same offsets +8 and pixel offset 8, reads the line at byte offset 16 and
 * writes dst8 at byte offset 8.  The weight pointer then advances by 32 bytes.
 * The asm does not reference obmc_stride, b_w or add.
 */
00834 static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
00835                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00836 snow_inner_add_yblock_mmx_header
00837 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
00838 snow_inner_add_yblock_mmx_accum("2", "16", "0")
00839 snow_inner_add_yblock_mmx_accum("1", "512", "0")
00840 snow_inner_add_yblock_mmx_accum("0", "528", "0")
00841 snow_inner_add_yblock_mmx_mix("0", "0")
00842 
00843 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
00844 snow_inner_add_yblock_mmx_accum("2", "24", "8")
00845 snow_inner_add_yblock_mmx_accum("1", "520", "8")
00846 snow_inner_add_yblock_mmx_accum("0", "536", "8")
00847 snow_inner_add_yblock_mmx_mix("16", "8")
00848 snow_inner_add_yblock_mmx_end("32")
00849 }
00850 
/*
 * Dispatcher for the SSE2 build of inner_add_yblock: picks a specialised
 * kernel from the block geometry and otherwise falls back to the generic
 * ff_snow_inner_add_yblock().  All arguments are forwarded unchanged.
 *   b_w == 16                              -> 16-wide SSE2 kernel
 *   b_w == 8, obmc_stride == 16, b_h even  -> 8-wide SSE2 kernel (two rows per pass)
 *   b_w == 8, obmc_stride == 16, b_h odd   -> 8-wide MMX kernel
 *   anything else                          -> generic implementation
 * NOTE(review): the b_w == 16 branch does not test obmc_stride although the
 * kernel it calls is named "obmc_32" - confirm callers guarantee that stride.
 */
00851 static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
00852                            int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00853 
00854     if (b_w == 16)
00855         inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00856     else if (b_w == 8 && obmc_stride == 16) {
00857         if (!(b_h & 1))
00858             inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00859         else
00860             inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00861     } else
00862          ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00863 }
00864 
/*
 * Dispatcher for the MMX build of inner_add_yblock: picks a specialised
 * kernel from the block geometry and otherwise falls back to the generic
 * ff_snow_inner_add_yblock().  All arguments are forwarded unchanged.
 *   b_w == 16                    -> 16-wide MMX kernel
 *   b_w == 8, obmc_stride == 16  -> 8-wide MMX kernel
 *   anything else                -> generic implementation
 * NOTE(review): the b_w == 16 branch does not test obmc_stride although the
 * kernel it calls is named "obmc_32" - confirm callers guarantee that stride.
 */
00865 static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
00866                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
00867     if (b_w == 16)
00868         inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00869     else if (b_w == 8 && obmc_stride == 16)
00870         inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00871     else
00872         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
00873 }
00874 
/*
 * Install the x86-optimised routines into a DWTContext according to the CPU
 * features reported by mm_support().  Nothing is changed unless MMX is present.
 * With MMX: inner_add_yblock always becomes the MMX version; the MMX
 * horizontal compose (and, when HAVE_7REGS is set, the vertical compose) is
 * installed only if MMX2 is also reported.
 * The result of mm_support() is stored in mm_flags, which is not declared in
 * this function.
 */
00875 void ff_dwt_init_x86(DWTContext *c)
00876 {
00877     mm_flags = mm_support();
00878 
00879     if (mm_flags & FF_MM_MMX) {
        /* "& 0" makes this condition always false, so the SSE2 routines below
         * are never installed.  NOTE(review): presumably a deliberate way of
         * disabling the SSE2 path - confirm before removing it. */
00880         if(mm_flags & FF_MM_SSE2 & 0){
00881             c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
00882 #if HAVE_7REGS
00883             c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
00884 #endif
00885             c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
00886         }
00887         else{
00888             if(mm_flags & FF_MM_MMX2){
00889             c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
00890 #if HAVE_7REGS
00891             c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
00892 #endif
00893             }
00894             c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
00895         }
00896     }
00897 }

Generated on Fri Sep 16 2011 17:17:47 for FFmpeg by  doxygen 1.7.1