• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/dsputil_mmx_rnd_template.c

Go to the documentation of this file.
00001 /*
00002  * DSP utils mmx functions are compiled twice for rnd/no_rnd
00003  * Copyright (c) 2000, 2001 Fabrice Bellard
00004  * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
00007  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
00008  * and improved by Zdenek Kabelac <kabi@users.sf.net>
00009  *
00010  * This file is part of FFmpeg.
00011  *
00012  * FFmpeg is free software; you can redistribute it and/or
00013  * modify it under the terms of the GNU Lesser General Public
00014  * License as published by the Free Software Foundation; either
00015  * version 2.1 of the License, or (at your option) any later version.
00016  *
00017  * FFmpeg is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020  * Lesser General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU Lesser General Public
00023  * License along with FFmpeg; if not, write to the Free Software
00024  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00025  */
00026 
00027 // put_pixels
/**
 * Horizontal two-tap "put" for an 8-pixel-wide block: every output byte is
 * produced from pixels[x] and pixels[x + 1] by PAVGBP and stored to block.
 *
 * MOVQ_BFE and PAVGBP are macros defined outside this template. Presumably
 * PAVGBP is the rounding / non-rounding byte average chosen by the including
 * file and MOVQ_BFE loads the constant it needs into mm6 -- confirm there.
 * As used here, PAVGBP takes two (input, input, output) register triples.
 *
 * @param block     destination; 8 bytes are written per row
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 4 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 4
 */
00028 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00029 {
00030     MOVQ_BFE(mm6);
00031     __asm__ volatile(
00032         "lea    (%3, %3), %%"REG_a"     \n\t" // REG_a = 2 * line_size
00033         ASMALIGN(3)
00034         "1:                             \n\t"
00035         "movq   (%1), %%mm0             \n\t" // row 0, x
00036         "movq   1(%1), %%mm1            \n\t" // row 0, x + 1
00037         "movq   (%1, %3), %%mm2         \n\t" // row 1, x
00038         "movq   1(%1, %3), %%mm3        \n\t" // row 1, x + 1
00039         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00040         "movq   %%mm4, (%2)             \n\t"
00041         "movq   %%mm5, (%2, %3)         \n\t"
00042         "add    %%"REG_a", %1           \n\t" // advance source by two rows
00043         "add    %%"REG_a", %2           \n\t" // advance destination by two rows
00044         "movq   (%1), %%mm0             \n\t"
00045         "movq   1(%1), %%mm1            \n\t"
00046         "movq   (%1, %3), %%mm2         \n\t"
00047         "movq   1(%1, %3), %%mm3        \n\t"
00048         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00049         "movq   %%mm4, (%2)             \n\t"
00050         "movq   %%mm5, (%2, %3)         \n\t"
00051         "add    %%"REG_a", %1           \n\t"
00052         "add    %%"REG_a", %2           \n\t"
00053         "subl   $4, %0                  \n\t" // four rows were written this pass
00054         "jnz    1b                      \n\t"
00055         :"+g"(h), "+S"(pixels), "+D"(block)
00056         :"r"((x86_reg)line_size)
00057         :REG_a, "memory");
00058 }
00059 
/**
 * "Put" of the PAVGB / PAVGBP combination of two sources into an
 * 8-pixel-wide destination.
 *
 * src1 is stepped by src1Stride per row. src2 is read as consecutive 8-byte
 * rows (offsets 0, 8, 16, 24 inside the loop, then advanced by 32).
 * PAVGB, PAVGBP and MOVQ_BFE are macros defined outside this template;
 * presumably they implement the rounding / non-rounding byte average
 * selected by the including file -- confirm there.
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * @param dst        destination; 8 bytes are written per row
 * @param src1       first source, stepped by src1Stride
 * @param src2       second source, stepped by 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count; if it is odd, one row is processed first.
 *                   The main loop is then entered unconditionally, handles
 *                   4 rows per pass and exits only when the counter reaches
 *                   exactly 0, so the remaining count must be a positive
 *                   multiple of 4
 */
00060 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00061 {
00062     MOVQ_BFE(mm6);
00063     __asm__ volatile(
00064         "testl $1, %0                   \n\t" // odd row count?
00065         " jz 1f                         \n\t"
00066         "movq   (%1), %%mm0             \n\t" // single leading row
00067         "movq   (%2), %%mm1             \n\t"
00068         "add    %4, %1                  \n\t"
00069         "add    $8, %2                  \n\t"
00070         PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
00071         "movq   %%mm4, (%3)             \n\t"
00072         "add    %5, %3                  \n\t"
00073         "decl   %0                      \n\t"
00074         ASMALIGN(3)
00075         "1:                             \n\t"
00076         "movq   (%1), %%mm0             \n\t"
00077         "movq   (%2), %%mm1             \n\t"
00078         "add    %4, %1                  \n\t"
00079         "movq   (%1), %%mm2             \n\t"
00080         "movq   8(%2), %%mm3            \n\t"
00081         "add    %4, %1                  \n\t"
00082         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00083         "movq   %%mm4, (%3)             \n\t"
00084         "add    %5, %3                  \n\t"
00085         "movq   %%mm5, (%3)             \n\t"
00086         "add    %5, %3                  \n\t"
00087         "movq   (%1), %%mm0             \n\t"
00088         "movq   16(%2), %%mm1           \n\t"
00089         "add    %4, %1                  \n\t"
00090         "movq   (%1), %%mm2             \n\t"
00091         "movq   24(%2), %%mm3           \n\t"
00092         "add    %4, %1                  \n\t"
00093         "add    $32, %2                 \n\t" // four 8-byte src2 rows consumed
00094         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00095         "movq   %%mm4, (%3)             \n\t"
00096         "add    %5, %3                  \n\t"
00097         "movq   %%mm5, (%3)             \n\t"
00098         "add    %5, %3                  \n\t"
00099         "subl   $4, %0                  \n\t"
00100         "jnz    1b                      \n\t"
00101 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00102         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00103 #else
00104         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00105 #endif
00106         :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00107         :"memory");
00108 }
00109 
/**
 * Horizontal two-tap "put" for a 16-pixel-wide block: every output byte is
 * produced from pixels[x] and pixels[x + 1] by PAVGBP and stored to block.
 * Each row is handled as two 8-byte halves (offsets 0 and 8).
 *
 * MOVQ_BFE and PAVGBP are macros defined outside this template. Presumably
 * PAVGBP is the rounding / non-rounding byte average chosen by the including
 * file -- confirm there.
 *
 * @param block     destination; 16 bytes are written per row
 * @param pixels    source; 17 bytes (offsets 0..16) are read per row
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 4 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 4
 */
00110 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00111 {
00112     MOVQ_BFE(mm6);
00113     __asm__ volatile(
00114         "lea        (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
00115         ASMALIGN(3)
00116         "1:                             \n\t"
00117         "movq   (%1), %%mm0             \n\t" // left half, rows 0 and 1
00118         "movq   1(%1), %%mm1            \n\t"
00119         "movq   (%1, %3), %%mm2         \n\t"
00120         "movq   1(%1, %3), %%mm3        \n\t"
00121         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00122         "movq   %%mm4, (%2)             \n\t"
00123         "movq   %%mm5, (%2, %3)         \n\t"
00124         "movq   8(%1), %%mm0            \n\t" // right half, rows 0 and 1
00125         "movq   9(%1), %%mm1            \n\t"
00126         "movq   8(%1, %3), %%mm2        \n\t"
00127         "movq   9(%1, %3), %%mm3        \n\t"
00128         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00129         "movq   %%mm4, 8(%2)            \n\t"
00130         "movq   %%mm5, 8(%2, %3)        \n\t"
00131         "add    %%"REG_a", %1           \n\t" // advance both pointers by two rows
00132         "add    %%"REG_a", %2           \n\t"
00133         "movq   (%1), %%mm0             \n\t"
00134         "movq   1(%1), %%mm1            \n\t"
00135         "movq   (%1, %3), %%mm2         \n\t"
00136         "movq   1(%1, %3), %%mm3        \n\t"
00137         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00138         "movq   %%mm4, (%2)             \n\t"
00139         "movq   %%mm5, (%2, %3)         \n\t"
00140         "movq   8(%1), %%mm0            \n\t"
00141         "movq   9(%1), %%mm1            \n\t"
00142         "movq   8(%1, %3), %%mm2        \n\t"
00143         "movq   9(%1, %3), %%mm3        \n\t"
00144         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00145         "movq   %%mm4, 8(%2)            \n\t"
00146         "movq   %%mm5, 8(%2, %3)        \n\t"
00147         "add    %%"REG_a", %1           \n\t"
00148         "add    %%"REG_a", %2           \n\t"
00149         "subl   $4, %0                  \n\t" // four rows were written this pass
00150         "jnz    1b                      \n\t"
00151         :"+g"(h), "+S"(pixels), "+D"(block)
00152         :"r"((x86_reg)line_size)
00153         :REG_a, "memory");
00154 }
00155 
/**
 * "Put" of the PAVGBP combination of two sources into a 16-pixel-wide
 * destination.
 *
 * src1 is stepped by src1Stride per row. src2 is read as consecutive 16-byte
 * rows (offsets 0/8 for one row, 16/24 for the next, then advanced by 32).
 * PAVGBP and MOVQ_BFE are macros defined outside this template; presumably
 * they implement the rounding / non-rounding byte average selected by the
 * including file -- confirm there.
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * @param dst        destination; 16 bytes are written per row
 * @param src1       first source, stepped by src1Stride
 * @param src2       second source, stepped by 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count; if it is odd, one row is processed first.
 *                   The main loop is then entered unconditionally, handles
 *                   2 rows per pass and exits only when the counter reaches
 *                   exactly 0, so the remaining count must be a positive
 *                   multiple of 2
 */
00156 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00157 {
00158     MOVQ_BFE(mm6);
00159     __asm__ volatile(
00160         "testl $1, %0                   \n\t" // odd row count?
00161         " jz 1f                         \n\t"
00162         "movq   (%1), %%mm0             \n\t" // single leading row, both halves
00163         "movq   (%2), %%mm1             \n\t"
00164         "movq   8(%1), %%mm2            \n\t"
00165         "movq   8(%2), %%mm3            \n\t"
00166         "add    %4, %1                  \n\t"
00167         "add    $16, %2                 \n\t"
00168         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00169         "movq   %%mm4, (%3)             \n\t"
00170         "movq   %%mm5, 8(%3)            \n\t"
00171         "add    %5, %3                  \n\t"
00172         "decl   %0                      \n\t"
00173         ASMALIGN(3)
00174         "1:                             \n\t"
00175         "movq   (%1), %%mm0             \n\t"
00176         "movq   (%2), %%mm1             \n\t"
00177         "movq   8(%1), %%mm2            \n\t"
00178         "movq   8(%2), %%mm3            \n\t"
00179         "add    %4, %1                  \n\t"
00180         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00181         "movq   %%mm4, (%3)             \n\t"
00182         "movq   %%mm5, 8(%3)            \n\t"
00183         "add    %5, %3                  \n\t"
00184         "movq   (%1), %%mm0             \n\t"
00185         "movq   16(%2), %%mm1           \n\t"
00186         "movq   8(%1), %%mm2            \n\t"
00187         "movq   24(%2), %%mm3           \n\t"
00188         "add    %4, %1                  \n\t"
00189         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
00190         "movq   %%mm4, (%3)             \n\t"
00191         "movq   %%mm5, 8(%3)            \n\t"
00192         "add    %5, %3                  \n\t"
00193         "add    $32, %2                 \n\t" // two 16-byte src2 rows consumed
00194         "subl   $2, %0                  \n\t"
00195         "jnz    1b                      \n\t"
00196 #if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00197         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00198 #else
00199         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00200 #endif
00201         :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00202         :"memory");
00203 }
00204 
/**
 * Vertical two-tap "put" for an 8-pixel-wide block: every output row is
 * produced by PAVGBP from a source row and the source row below it.
 *
 * The bottom row of each pair is kept in a register and reused as the top
 * row of the next pair (mm0 across loop passes, mm2 inside a pass), so each
 * source row is loaded once. MOVQ_BFE and PAVGBP are macros defined outside
 * this template; presumably PAVGBP is the rounding / non-rounding byte
 * average chosen by the including file -- confirm there.
 *
 * @param block     destination; 8 bytes are written per row
 * @param pixels    source; h + 1 rows of 8 bytes are read
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 4 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 4
 */
00205 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00206 {
00207     MOVQ_BFE(mm6);
00208     __asm__ volatile(
00209         "lea (%3, %3), %%"REG_a"        \n\t" // REG_a = 2 * line_size
00210         "movq (%1), %%mm0               \n\t" // mm0 = first source row
00211         ASMALIGN(3)
00212         "1:                             \n\t"
00213         "movq   (%1, %3), %%mm1         \n\t" // row + 1
00214         "movq   (%1, %%"REG_a"),%%mm2   \n\t" // row + 2
00215         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
00216         "movq   %%mm4, (%2)             \n\t"
00217         "movq   %%mm5, (%2, %3)         \n\t"
00218         "add    %%"REG_a", %1           \n\t"
00219         "add    %%"REG_a", %2           \n\t"
00220         "movq   (%1, %3), %%mm1         \n\t"
00221         "movq   (%1, %%"REG_a"),%%mm0   \n\t" // becomes the top row of the next pass
00222         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
00223         "movq   %%mm4, (%2)             \n\t"
00224         "movq   %%mm5, (%2, %3)         \n\t"
00225         "add    %%"REG_a", %1           \n\t"
00226         "add    %%"REG_a", %2           \n\t"
00227         "subl   $4, %0                  \n\t" // four rows were written this pass
00228         "jnz    1b                      \n\t"
00229         :"+g"(h), "+S"(pixels), "+D"(block)
00230         :"r"((x86_reg)line_size)
00231         :REG_a, "memory");
00232 }
00233 
/**
 * Four-tap (2x2) "put" for an 8-pixel-wide block. Each output byte is
 *   (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * computed on 16-bit words, where mm6 is the constant loaded by SET_RND.
 *
 * The horizontal sum p[x] + p[x+1] of every source row is computed once and
 * reused for the output row above and the one below. The two loop halves are
 * identical except that the register pairs mm0/mm1 and mm4/mm5 (and mm2/mm3)
 * swap roles. MOVQ_ZERO and SET_RND are macros defined outside this
 * template; mm7 is used as the zero operand for byte-to-word unpacking, so
 * MOVQ_ZERO presumably clears it -- confirm there.
 *
 * @param block     destination; 8 bytes are written per row. The pointer is
 *                  not advanced: rows are addressed through REG_a
 * @param pixels    source; h + 1 rows of 9 bytes (offsets 0..8) are read
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 2 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 2
 */
00234 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00235 {
00236     MOVQ_ZERO(mm7);
00237     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
00238     __asm__ volatile(
00239         "movq   (%1), %%mm0             \n\t" // first row, x
00240         "movq   1(%1), %%mm4            \n\t" // first row, x + 1
00241         "movq   %%mm0, %%mm1            \n\t"
00242         "movq   %%mm4, %%mm5            \n\t"
00243         "punpcklbw %%mm7, %%mm0         \n\t" // widen low four bytes to words
00244         "punpcklbw %%mm7, %%mm4         \n\t"
00245         "punpckhbw %%mm7, %%mm1         \n\t" // widen high four bytes to words
00246         "punpckhbw %%mm7, %%mm5         \n\t"
00247         "paddusw %%mm0, %%mm4           \n\t" // mm4/mm5 = horizontal sums of first row
00248         "paddusw %%mm1, %%mm5           \n\t"
00249         "xor    %%"REG_a", %%"REG_a"    \n\t" // REG_a = running row offset, starts at 0
00250         "add    %3, %1                  \n\t" // source now points one row below destination row
00251         ASMALIGN(3)
00252         "1:                             \n\t"
00253         "movq   (%1, %%"REG_a"), %%mm0  \n\t"
00254         "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
00255         "movq   %%mm0, %%mm1            \n\t"
00256         "movq   %%mm2, %%mm3            \n\t"
00257         "punpcklbw %%mm7, %%mm0         \n\t"
00258         "punpcklbw %%mm7, %%mm2         \n\t"
00259         "punpckhbw %%mm7, %%mm1         \n\t"
00260         "punpckhbw %%mm7, %%mm3         \n\t"
00261         "paddusw %%mm2, %%mm0           \n\t" // mm0/mm1 = horizontal sums of the lower row
00262         "paddusw %%mm3, %%mm1           \n\t"
00263         "paddusw %%mm6, %%mm4           \n\t" // add the SET_RND constant
00264         "paddusw %%mm6, %%mm5           \n\t"
00265         "paddusw %%mm0, %%mm4           \n\t" // upper sums + lower sums
00266         "paddusw %%mm1, %%mm5           \n\t"
00267         "psrlw  $2, %%mm4               \n\t" // divide by four
00268         "psrlw  $2, %%mm5               \n\t"
00269         "packuswb  %%mm5, %%mm4         \n\t" // back to eight bytes
00270         "movq   %%mm4, (%2, %%"REG_a")  \n\t"
00271         "add    %3, %%"REG_a"           \n\t"
00272
00273         "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
00274         "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
00275         "movq   %%mm2, %%mm3            \n\t"
00276         "movq   %%mm4, %%mm5            \n\t"
00277         "punpcklbw %%mm7, %%mm2         \n\t"
00278         "punpcklbw %%mm7, %%mm4         \n\t"
00279         "punpckhbw %%mm7, %%mm3         \n\t"
00280         "punpckhbw %%mm7, %%mm5         \n\t"
00281         "paddusw %%mm2, %%mm4           \n\t" // mm4/mm5 = sums carried into the next pass
00282         "paddusw %%mm3, %%mm5           \n\t"
00283         "paddusw %%mm6, %%mm0           \n\t"
00284         "paddusw %%mm6, %%mm1           \n\t"
00285         "paddusw %%mm4, %%mm0           \n\t"
00286         "paddusw %%mm5, %%mm1           \n\t"
00287         "psrlw  $2, %%mm0               \n\t"
00288         "psrlw  $2, %%mm1               \n\t"
00289         "packuswb  %%mm1, %%mm0         \n\t"
00290         "movq   %%mm0, (%2, %%"REG_a")  \n\t"
00291         "add    %3, %%"REG_a"           \n\t"
00292
00293         "subl   $2, %0                  \n\t" // two rows were written this pass
00294         "jnz    1b                      \n\t"
00295         :"+g"(h), "+S"(pixels)
00296         :"D"(block), "r"((x86_reg)line_size)
00297         :REG_a, "memory");
00298 }
00299 
00300 // avg_pixels
/**
 * Combine a 4-pixel-wide source block into the destination: for each row,
 * 4 bytes of block and 4 bytes of pixels are passed through OP_AVG and the
 * result is written back to block.
 *
 * MOVQ_BFE, JUMPALIGN and OP_AVG are macros defined outside this template.
 * Presumably OP_AVG is the rounding / non-rounding byte average chosen by
 * the including file and mm6 holds the constant it needs -- confirm there.
 *
 * @param block     destination, read and rewritten (4 bytes per row)
 * @param pixels    source (4 bytes read per row)
 * @param line_size byte stride applied to both pointers
 * @param h         row count; must be at least 1, because the body runs
 *                  before the counter is tested and the loop ends only when
 *                  --h reaches 0
 */
00301 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00302 {
00303     MOVQ_BFE(mm6);
00304     JUMPALIGN();
00305     do {
00306         __asm__ volatile(
00307              "movd  %0, %%mm0           \n\t" // current destination pixels
00308              "movd  %1, %%mm1           \n\t" // source pixels
00309              OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
00310              "movd  %%mm2, %0           \n\t"
00311              :"+m"(*block)
00312              :"m"(*pixels)
00313              :"memory");
00314         pixels += line_size;
00315         block += line_size;
00316     }
00317     while (--h);
00318 }
00319 
00320 // in case more speed is needed - unrolling would certainly help
/**
 * Combine an 8-pixel-wide source block into the destination: for each row,
 * 8 bytes of block and 8 bytes of pixels are passed through OP_AVG and the
 * result is written back to block.
 *
 * MOVQ_BFE, JUMPALIGN and OP_AVG are macros defined outside this template.
 * Presumably OP_AVG is the rounding / non-rounding byte average chosen by
 * the including file and mm6 holds the constant it needs -- confirm there.
 *
 * @param block     destination, read and rewritten (8 bytes per row)
 * @param pixels    source (8 bytes read per row)
 * @param line_size byte stride applied to both pointers
 * @param h         row count; must be at least 1, because the body runs
 *                  before the counter is tested and the loop ends only when
 *                  --h reaches 0
 */
00321 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00322 {
00323     MOVQ_BFE(mm6);
00324     JUMPALIGN();
00325     do {
00326         __asm__ volatile(
00327              "movq  %0, %%mm0           \n\t" // current destination pixels
00328              "movq  %1, %%mm1           \n\t" // source pixels
00329              OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
00330              "movq  %%mm2, %0           \n\t"
00331              :"+m"(*block)
00332              :"m"(*pixels)
00333              :"memory");
00334         pixels += line_size;
00335         block += line_size;
00336     }
00337     while (--h);
00338 }
00339 
/**
 * Combine a 16-pixel-wide source block into the destination: for each row,
 * two 8-byte halves of block and pixels are passed through OP_AVG and the
 * results are written back to block.
 *
 * MOVQ_BFE, JUMPALIGN and OP_AVG are macros defined outside this template.
 * Presumably OP_AVG is the rounding / non-rounding byte average chosen by
 * the including file and mm6 holds the constant it needs -- confirm there.
 *
 * NOTE(review): the second half is addressed by writing a literal 8 directly
 * in front of the memory operand text. This presumably relies on the
 * compiler printing %0 / %1 without a displacement of their own -- confirm
 * for the supported compilers.
 *
 * @param block     destination, read and rewritten (16 bytes per row)
 * @param pixels    source (16 bytes read per row)
 * @param line_size byte stride applied to both pointers
 * @param h         row count; must be at least 1, because the body runs
 *                  before the counter is tested and the loop ends only when
 *                  --h reaches 0
 */
00340 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00341 {
00342     MOVQ_BFE(mm6);
00343     JUMPALIGN();
00344     do {
00345         __asm__ volatile(
00346              "movq  %0, %%mm0           \n\t" // left half
00347              "movq  %1, %%mm1           \n\t"
00348              OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
00349              "movq  %%mm2, %0           \n\t"
00350              "movq  8%0, %%mm0          \n\t" // right half
00351              "movq  8%1, %%mm1          \n\t"
00352              OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
00353              "movq  %%mm2, 8%0          \n\t"
00354              :"+m"(*block)
00355              :"m"(*pixels)
00356              :"memory");
00357         pixels += line_size;
00358         block += line_size;
00359     }
00360     while (--h);
00361 }
00362 
/**
 * Horizontal two-tap "avg" for an 8-pixel-wide block: pixels[x] and
 * pixels[x + 1] are combined by PAVGB, and that result is combined with the
 * existing destination bytes by OP_AVG before being written back.
 *
 * MOVQ_BFE, JUMPALIGN, PAVGB and OP_AVG are macros defined outside this
 * template; presumably both are rounding / non-rounding byte averages chosen
 * by the including file -- confirm there.
 *
 * NOTE(review): the x + 1 load is written as a literal 1 in front of the
 * memory operand text, which presumably relies on the compiler printing %1
 * without a displacement of its own -- confirm.
 *
 * @param block     destination, read and rewritten (8 bytes per row)
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row
 * @param line_size byte stride applied to both pointers
 * @param h         row count; must be at least 1 (the loop ends only when
 *                  --h reaches 0)
 */
00363 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00364 {
00365     MOVQ_BFE(mm6);
00366     JUMPALIGN();
00367     do {
00368         __asm__ volatile(
00369             "movq  %1, %%mm0            \n\t" // source, x
00370             "movq  1%1, %%mm1           \n\t" // source, x + 1
00371             "movq  %0, %%mm3            \n\t" // current destination pixels
00372             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00373             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00374             "movq  %%mm0, %0            \n\t"
00375             :"+m"(*block)
00376             :"m"(*pixels)
00377             :"memory");
00378         pixels += line_size;
00379         block += line_size;
00380     } while (--h);
00381 }
00382 
/**
 * Two-source "avg" for an 8-pixel-wide block: the rows of src1 and src2 are
 * combined by PAVGB, and that result is combined with the existing
 * destination bytes by OP_AVG before being written back.
 *
 * MOVQ_BFE, JUMPALIGN, PAVGB and OP_AVG are macros defined outside this
 * template; presumably both are rounding / non-rounding byte averages chosen
 * by the including file -- confirm there.
 *
 * @param dst        destination, read and rewritten (8 bytes per row)
 * @param src1       first source, stepped by src1Stride
 * @param src2       second source, stepped by 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count; must be at least 1 (the loop ends only when
 *                   --h reaches 0)
 */
00383 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00384 {
00385     MOVQ_BFE(mm6);
00386     JUMPALIGN();
00387     do {
00388         __asm__ volatile(
00389             "movq  %1, %%mm0            \n\t" // src1 row
00390             "movq  %2, %%mm1            \n\t" // src2 row
00391             "movq  %0, %%mm3            \n\t" // current destination pixels
00392             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00393             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00394             "movq  %%mm0, %0            \n\t"
00395             :"+m"(*dst)
00396             :"m"(*src1), "m"(*src2)
00397             :"memory");
00398         dst += dstStride;
00399         src1 += src1Stride;
00400         src2 += 8;
00401     } while (--h);
00402 }
00403 
/**
 * Horizontal two-tap "avg" for a 16-pixel-wide block: pixels[x] and
 * pixels[x + 1] are combined by PAVGB, and that result is combined with the
 * existing destination bytes by OP_AVG before being written back. Each row
 * is handled as two 8-byte halves.
 *
 * MOVQ_BFE, JUMPALIGN, PAVGB and OP_AVG are macros defined outside this
 * template; presumably both are rounding / non-rounding byte averages chosen
 * by the including file -- confirm there.
 *
 * NOTE(review): offsets 1, 8 and 9 are written as literals in front of the
 * memory operand text, which presumably relies on the compiler printing
 * %0 / %1 without a displacement of their own -- confirm.
 *
 * @param block     destination, read and rewritten (16 bytes per row)
 * @param pixels    source; 17 bytes (offsets 0..16) are read per row
 * @param line_size byte stride applied to both pointers
 * @param h         row count; must be at least 1 (the loop ends only when
 *                  --h reaches 0)
 */
00404 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00405 {
00406     MOVQ_BFE(mm6);
00407     JUMPALIGN();
00408     do {
00409         __asm__ volatile(
00410             "movq  %1, %%mm0            \n\t" // left half: source x and x + 1
00411             "movq  1%1, %%mm1           \n\t"
00412             "movq  %0, %%mm3            \n\t"
00413             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00414             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00415             "movq  %%mm0, %0            \n\t"
00416             "movq  8%1, %%mm0           \n\t" // right half: source x + 8 and x + 9
00417             "movq  9%1, %%mm1           \n\t"
00418             "movq  8%0, %%mm3           \n\t"
00419             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00420             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00421             "movq  %%mm0, 8%0           \n\t"
00422             :"+m"(*block)
00423             :"m"(*pixels)
00424             :"memory");
00425         pixels += line_size;
00426         block += line_size;
00427     } while (--h);
00428 }
00429 
/**
 * Two-source "avg" for a 16-pixel-wide block: the rows of src1 and src2 are
 * combined by PAVGB, and that result is combined with the existing
 * destination bytes by OP_AVG before being written back. Each row is handled
 * as two 8-byte halves.
 *
 * MOVQ_BFE, JUMPALIGN, PAVGB and OP_AVG are macros defined outside this
 * template; presumably both are rounding / non-rounding byte averages chosen
 * by the including file -- confirm there.
 *
 * NOTE(review): the right half is addressed by a literal 8 written in front
 * of the memory operand text, which presumably relies on the compiler
 * printing the operands without a displacement of their own -- confirm.
 *
 * @param dst        destination, read and rewritten (16 bytes per row)
 * @param src1       first source, stepped by src1Stride
 * @param src2       second source, stepped by 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count; must be at least 1 (the loop ends only when
 *                   --h reaches 0)
 */
00430 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00431 {
00432     MOVQ_BFE(mm6);
00433     JUMPALIGN();
00434     do {
00435         __asm__ volatile(
00436             "movq  %1, %%mm0            \n\t" // left half
00437             "movq  %2, %%mm1            \n\t"
00438             "movq  %0, %%mm3            \n\t"
00439             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00440             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00441             "movq  %%mm0, %0            \n\t"
00442             "movq  8%1, %%mm0           \n\t" // right half
00443             "movq  8%2, %%mm1           \n\t"
00444             "movq  8%0, %%mm3           \n\t"
00445             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
00446             OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
00447             "movq  %%mm0, 8%0           \n\t"
00448             :"+m"(*dst)
00449             :"m"(*src1), "m"(*src2)
00450             :"memory");
00451         dst += dstStride;
00452         src1 += src1Stride;
00453         src2 += 16;
00454     } while (--h);
00455 }
00456 
/**
 * Vertical two-tap "avg" for an 8-pixel-wide block: each source row is
 * combined with the row below it by PAVGBP, and that result is combined with
 * the existing destination bytes by OP_AVG before being written back.
 *
 * The bottom source row of each pair is kept in a register for the next pair
 * (mm2 inside a pass, mm0 across passes). OP_AVG writes its result into the
 * register that is not being carried: mm0 in the first half, mm2 in the
 * second. MOVQ_BFE, PAVGBP and OP_AVG are macros defined outside this
 * template; presumably both are rounding / non-rounding byte averages chosen
 * by the including file -- confirm there.
 *
 * @param block     destination, read and rewritten (8 bytes per row)
 * @param pixels    source; h + 1 rows of 8 bytes are read
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 4 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 4
 */
00457 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00458 {
00459     MOVQ_BFE(mm6);
00460     __asm__ volatile(
00461         "lea    (%3, %3), %%"REG_a"     \n\t" // REG_a = 2 * line_size
00462         "movq   (%1), %%mm0             \n\t" // mm0 = first source row
00463         ASMALIGN(3)
00464         "1:                             \n\t"
00465         "movq   (%1, %3), %%mm1         \n\t" // row + 1
00466         "movq   (%1, %%"REG_a"), %%mm2  \n\t" // row + 2, carried into the second half
00467         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
00468         "movq   (%2), %%mm3             \n\t" // existing destination row
00469         OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
00470         "movq   (%2, %3), %%mm3         \n\t"
00471         OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
00472         "movq   %%mm0, (%2)             \n\t"
00473         "movq   %%mm1, (%2, %3)         \n\t"
00474         "add    %%"REG_a", %1           \n\t"
00475         "add    %%"REG_a", %2           \n\t"
00476
00477         "movq   (%1, %3), %%mm1         \n\t"
00478         "movq   (%1, %%"REG_a"), %%mm0  \n\t" // carried into the next pass
00479         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
00480         "movq   (%2), %%mm3             \n\t"
00481         OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
00482         "movq   (%2, %3), %%mm3         \n\t"
00483         OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
00484         "movq   %%mm2, (%2)             \n\t"
00485         "movq   %%mm1, (%2, %3)         \n\t"
00486         "add    %%"REG_a", %1           \n\t"
00487         "add    %%"REG_a", %2           \n\t"
00488
00489         "subl   $4, %0                  \n\t" // four rows were written this pass
00490         "jnz    1b                      \n\t"
00491         :"+g"(h), "+S"(pixels), "+D"(block)
00492         :"r"((x86_reg)line_size)
00493         :REG_a, "memory");
00494 }
00495 
00496 // this routine is 'slightly' suboptimal but mostly unused
/**
 * Four-tap (2x2) "avg" for an 8-pixel-wide block. The interpolated value
 *   (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * is computed on 16-bit words (mm6 is the constant loaded by SET_RND),
 * packed back to bytes, and then combined with the existing destination
 * bytes by OP_AVG before being written back.
 *
 * The horizontal sum p[x] + p[x+1] of every source row is computed once and
 * reused for the output row above and the one below; the two loop halves
 * differ only in which register pair (mm0/mm1 or mm4/mm5) carries it.
 * mm6 is occupied by the SET_RND constant here, so the per-byte 0xFE value
 * handed to OP_AVG is rebuilt in mm2 each time (all-ones, then doubled per
 * byte). MOVQ_ZERO, SET_RND and OP_AVG are macros defined outside this
 * template; mm7 is used as the zero operand for byte-to-word unpacking, so
 * MOVQ_ZERO presumably clears it -- confirm there.
 *
 * @param block     destination, read and rewritten (8 bytes per row). The
 *                  pointer is not advanced: rows are addressed through REG_a
 * @param pixels    source; h + 1 rows of 9 bytes (offsets 0..8) are read
 * @param line_size byte stride applied to both source and destination
 * @param h         row count; 2 rows are handled per loop pass and the loop
 *                  exits only when the counter reaches exactly 0, so h must
 *                  be a positive multiple of 2
 */
00497 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00498 {
00499     MOVQ_ZERO(mm7);
00500     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
00501     __asm__ volatile(
00502         "movq   (%1), %%mm0             \n\t" // first row, x
00503         "movq   1(%1), %%mm4            \n\t" // first row, x + 1
00504         "movq   %%mm0, %%mm1            \n\t"
00505         "movq   %%mm4, %%mm5            \n\t"
00506         "punpcklbw %%mm7, %%mm0         \n\t" // widen low four bytes to words
00507         "punpcklbw %%mm7, %%mm4         \n\t"
00508         "punpckhbw %%mm7, %%mm1         \n\t" // widen high four bytes to words
00509         "punpckhbw %%mm7, %%mm5         \n\t"
00510         "paddusw %%mm0, %%mm4           \n\t" // mm4/mm5 = horizontal sums of first row
00511         "paddusw %%mm1, %%mm5           \n\t"
00512         "xor    %%"REG_a", %%"REG_a"    \n\t" // REG_a = running row offset, starts at 0
00513         "add    %3, %1                  \n\t" // source now points one row below destination row
00514         ASMALIGN(3)
00515         "1:                             \n\t"
00516         "movq   (%1, %%"REG_a"), %%mm0  \n\t"
00517         "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
00518         "movq   %%mm0, %%mm1            \n\t"
00519         "movq   %%mm2, %%mm3            \n\t"
00520         "punpcklbw %%mm7, %%mm0         \n\t"
00521         "punpcklbw %%mm7, %%mm2         \n\t"
00522         "punpckhbw %%mm7, %%mm1         \n\t"
00523         "punpckhbw %%mm7, %%mm3         \n\t"
00524         "paddusw %%mm2, %%mm0           \n\t" // mm0/mm1 = horizontal sums of the lower row
00525         "paddusw %%mm3, %%mm1           \n\t"
00526         "paddusw %%mm6, %%mm4           \n\t" // add the SET_RND constant
00527         "paddusw %%mm6, %%mm5           \n\t"
00528         "paddusw %%mm0, %%mm4           \n\t" // upper sums + lower sums
00529         "paddusw %%mm1, %%mm5           \n\t"
00530         "psrlw  $2, %%mm4               \n\t" // divide by four
00531         "psrlw  $2, %%mm5               \n\t"
00532                 "movq   (%2, %%"REG_a"), %%mm3  \n\t" // existing destination row
00533         "packuswb  %%mm5, %%mm4         \n\t"
00534                 "pcmpeqd %%mm2, %%mm2   \n\t" // mm2 = all bits set
00535                 "paddb %%mm2, %%mm2     \n\t" // each byte 0xFF + 0xFF = 0xFE
00536                 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
00537                 "movq   %%mm5, (%2, %%"REG_a")  \n\t"
00538         "add    %3, %%"REG_a"                \n\t"
00539
00540         "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
00541         "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
00542         "movq   %%mm2, %%mm3            \n\t"
00543         "movq   %%mm4, %%mm5            \n\t"
00544         "punpcklbw %%mm7, %%mm2         \n\t"
00545         "punpcklbw %%mm7, %%mm4         \n\t"
00546         "punpckhbw %%mm7, %%mm3         \n\t"
00547         "punpckhbw %%mm7, %%mm5         \n\t"
00548         "paddusw %%mm2, %%mm4           \n\t" // mm4/mm5 = sums carried into the next pass
00549         "paddusw %%mm3, %%mm5           \n\t"
00550         "paddusw %%mm6, %%mm0           \n\t"
00551         "paddusw %%mm6, %%mm1           \n\t"
00552         "paddusw %%mm4, %%mm0           \n\t"
00553         "paddusw %%mm5, %%mm1           \n\t"
00554         "psrlw  $2, %%mm0               \n\t"
00555         "psrlw  $2, %%mm1               \n\t"
00556                 "movq   (%2, %%"REG_a"), %%mm3  \n\t"
00557         "packuswb  %%mm1, %%mm0         \n\t"
00558                 "pcmpeqd %%mm2, %%mm2   \n\t"
00559                 "paddb %%mm2, %%mm2     \n\t"
00560                 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
00561                 "movq   %%mm1, (%2, %%"REG_a")  \n\t"
00562         "add    %3, %%"REG_a"           \n\t"
00563
00564         "subl   $2, %0                  \n\t" // two rows were written this pass
00565         "jnz    1b                      \n\t"
00566         :"+g"(h), "+S"(pixels)
00567         :"D"(block), "r"((x86_reg)line_size)
00568         :REG_a, "memory");
00569 }
00570 
00571 //FIXME optimize
/**
 * 16-pixel-wide variant of DEF(put, pixels8_y2): the 8-pixel-wide routine is
 * run on the left half (column offset 0) and then on the right half (column
 * offset 8), with the same line_size and h for both.
 */
00572 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00573     for (int col = 0; col < 16; col += 8) /* left half first, then right half */
00574         DEF(put, pixels8_y2)(block + col, pixels + col, line_size, h);
00575 }
00576 
/**
 * 16-pixel-wide variant of DEF(put, pixels8_xy2): the 8-pixel-wide routine is
 * run on the left half (column offset 0) and then on the right half (column
 * offset 8), with the same line_size and h for both.
 */
00577 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00578     for (int col = 0; col < 16; col += 8) /* left half first, then right half */
00579         DEF(put, pixels8_xy2)(block + col, pixels + col, line_size, h);
00580 }
00581 
/**
 * 16-pixel-wide variant of DEF(avg, pixels8_y2): the 8-pixel-wide routine is
 * run on the left half (column offset 0) and then on the right half (column
 * offset 8), with the same line_size and h for both.
 */
00582 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00583     for (int col = 0; col < 16; col += 8) /* left half first, then right half */
00584         DEF(avg, pixels8_y2)(block + col, pixels + col, line_size, h);
00585 }
00586 
/**
 * 16-pixel-wide variant of DEF(avg, pixels8_xy2): the 8-pixel-wide routine is
 * run on the left half (column offset 0) and then on the right half (column
 * offset 8), with the same line_size and h for both.
 */
00587 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00588     for (int col = 0; col < 16; col += 8) /* left half first, then right half */
00589         DEF(avg, pixels8_xy2)(block + col, pixels + col, line_size, h);
00590 }

Generated on Fri Sep 16 2011 17:17:46 for FFmpeg by  doxygen 1.7.1