• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libswscale/swscale_template.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
/* Reset the CPU-flavour helper macros.  This template is included once per
 * instruction-set flavour (MMX / MMX2 / 3DNow!, selected by the
 * COMPILE_TEMPLATE_* flags below), so each inclusion must start from a
 * clean slate and redefine them. */
00021 #undef REAL_MOVNTQ
00022 #undef MOVNTQ
00023 #undef PAVGB
00024 #undef PREFETCH
00025 
/* PREFETCH: data-prefetch mnemonic for the current target —
 * 3DNow! "prefetch", MMX2 "prefetchnta"; for plain MMX it expands to an
 * asm-level comment so the surrounding asm strings still assemble. */
00026 #if COMPILE_TEMPLATE_AMD3DNOW
00027 #define PREFETCH  "prefetch"
00028 #elif COMPILE_TEMPLATE_MMX2
00029 #define PREFETCH "prefetchnta"
00030 #else
00031 #define PREFETCH  " # nop"
00032 #endif
00033 
/* PAVGB(a,b): packed byte average — "pavgb" (MMX2) or "pavgusb" (3DNow!).
 * Note there is no #else: the macro stays undefined on plain MMX, so any
 * use of it there would be a compile error. */
00034 #if COMPILE_TEMPLATE_MMX2
00035 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00036 #elif COMPILE_TEMPLATE_AMD3DNOW
00037 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00038 #endif
00039 
/* MOVNTQ(a,b): quadword store — non-temporal "movntq" when MMX2 is
 * available (bypasses the cache for streaming writes), plain "movq"
 * otherwise.  The REAL_/wrapper pair ensures macro arguments are fully
 * expanded before being stringified. */
00040 #if COMPILE_TEMPLATE_MMX2
00041 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00042 #else
00043 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00044 #endif
00045 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
00046 
/* Pull in the AltiVec (PPC) implementations when building that flavour. */
00047 #if COMPILE_TEMPLATE_ALTIVEC
00048 #include "ppc/swscale_altivec_template.c"
00049 #endif
00050 
/* YSCALEYUV2YV12X(x, offset, dest, width)
 * Vertical multi-tap filter: walks the {src pointer, coefficient} list at
 * " offset "(%0) — terminated by a NULL source pointer (test/jnz) —
 * accumulating pmulhw products on top of the rounder loaded from
 * VROUNDER_OFFSET(%0), then shifts right by 3, packs to unsigned bytes and
 * stores 8 output pixels per outer iteration with MOVNTQ.
 * Operands: %0 = &c->redDither (base for the *_OFFSET constants),
 * %1 = dest, %2 = width.  Clobbers REG_a, REG_d, REG_S. */
00051 #define YSCALEYUV2YV12X(x, offset, dest, width) \
00052     __asm__ volatile(\
00053         "xor                          %%"REG_a", %%"REG_a"  \n\t"\
00054         "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
00055         "movq                             %%mm3, %%mm4      \n\t"\
00056         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00057         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00058         ASMALIGN(4) /* FIXME Unroll? */\
00059         "1:                                                 \n\t"\
00060         "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
00061         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
00062         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
00063         "add                                $16, %%"REG_d"  \n\t"\
00064         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00065         "test                         %%"REG_S", %%"REG_S"  \n\t"\
00066         "pmulhw                           %%mm0, %%mm2      \n\t"\
00067         "pmulhw                           %%mm0, %%mm5      \n\t"\
00068         "paddw                            %%mm2, %%mm3      \n\t"\
00069         "paddw                            %%mm5, %%mm4      \n\t"\
00070         " jnz                                1b             \n\t"\
00071         "psraw                               $3, %%mm3      \n\t"\
00072         "psraw                               $3, %%mm4      \n\t"\
00073         "packuswb                         %%mm4, %%mm3      \n\t"\
00074         MOVNTQ(%%mm3, (%1, %%REGa))\
00075         "add                                 $8, %%"REG_a"  \n\t"\
00076         "cmp                                 %2, %%"REG_a"  \n\t"\
00077         "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
00078         "movq                             %%mm3, %%mm4      \n\t"\
00079         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00080         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00081         "jb                                  1b             \n\t"\
00082         :: "r" (&c->redDither),\
00083         "r" (dest), "g" (width)\
00084         : "%"REG_a, "%"REG_d, "%"REG_S\
00085     );
00086 
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Higher-precision variant of YSCALEYUV2YV12X: consumes the filter list in
 * the packed APCK_* layout (two source pointers plus a coefficient pair per
 * entry) and accumulates in 32 bits via punpck[lh]wd + pmaddwd, so rounding
 * (VROUNDER_OFFSET) is applied only once after the 32->16 bit narrowing.
 * Same operands/clobbers as YSCALEYUV2YV12X. */
00087 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
00088     __asm__ volatile(\
00089         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00090         "xor                          %%"REG_a", %%"REG_a"  \n\t"\
00091         "pxor                             %%mm4, %%mm4      \n\t"\
00092         "pxor                             %%mm5, %%mm5      \n\t"\
00093         "pxor                             %%mm6, %%mm6      \n\t"\
00094         "pxor                             %%mm7, %%mm7      \n\t"\
00095         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00096         ASMALIGN(4) \
00097         "1:                                                 \n\t"\
00098         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
00099         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
00100         "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
00101         "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
00102         "movq                             %%mm0, %%mm3      \n\t"\
00103         "punpcklwd                        %%mm1, %%mm0      \n\t"\
00104         "punpckhwd                        %%mm1, %%mm3      \n\t"\
00105         "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
00106         "pmaddwd                          %%mm1, %%mm0      \n\t"\
00107         "pmaddwd                          %%mm1, %%mm3      \n\t"\
00108         "paddd                            %%mm0, %%mm4      \n\t"\
00109         "paddd                            %%mm3, %%mm5      \n\t"\
00110         "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
00111         "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
00112         "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
00113         "test                         %%"REG_S", %%"REG_S"  \n\t"\
00114         "movq                             %%mm2, %%mm0      \n\t"\
00115         "punpcklwd                        %%mm3, %%mm2      \n\t"\
00116         "punpckhwd                        %%mm3, %%mm0      \n\t"\
00117         "pmaddwd                          %%mm1, %%mm2      \n\t"\
00118         "pmaddwd                          %%mm1, %%mm0      \n\t"\
00119         "paddd                            %%mm2, %%mm6      \n\t"\
00120         "paddd                            %%mm0, %%mm7      \n\t"\
00121         " jnz                                1b             \n\t"\
00122         "psrad                              $16, %%mm4      \n\t"\
00123         "psrad                              $16, %%mm5      \n\t"\
00124         "psrad                              $16, %%mm6      \n\t"\
00125         "psrad                              $16, %%mm7      \n\t"\
00126         "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
00127         "packssdw                         %%mm5, %%mm4      \n\t"\
00128         "packssdw                         %%mm7, %%mm6      \n\t"\
00129         "paddw                            %%mm0, %%mm4      \n\t"\
00130         "paddw                            %%mm0, %%mm6      \n\t"\
00131         "psraw                               $3, %%mm4      \n\t"\
00132         "psraw                               $3, %%mm6      \n\t"\
00133         "packuswb                         %%mm6, %%mm4      \n\t"\
00134         MOVNTQ(%%mm4, (%1, %%REGa))\
00135         "add                                 $8, %%"REG_a"  \n\t"\
00136         "cmp                                 %2, %%"REG_a"  \n\t"\
00137         "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00138         "pxor                             %%mm4, %%mm4      \n\t"\
00139         "pxor                             %%mm5, %%mm5      \n\t"\
00140         "pxor                             %%mm6, %%mm6      \n\t"\
00141         "pxor                             %%mm7, %%mm7      \n\t"\
00142         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00143         "jb                                  1b             \n\t"\
00144         :: "r" (&c->redDither),\
00145         "r" (dest), "g" (width)\
00146         : "%"REG_a, "%"REG_d, "%"REG_S\
00147     );
00148 
/* YSCALEYUV2YV121
 * 1-tap (no vertical filtering) case: truncate the 16-bit intermediates
 * (>>7) and pack to unsigned bytes.  %0 = src, %1 = dst; %2 appears to be a
 * negative index loaded into REG_a that counts up until the add sets the
 * carry flag (jnc) — TODO confirm against the caller. */
00149 #define YSCALEYUV2YV121 \
00150     "mov %2, %%"REG_a"                    \n\t"\
00151     ASMALIGN(4) /* FIXME Unroll? */\
00152     "1:                                   \n\t"\
00153     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
00154     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
00155     "psraw                 $7, %%mm0      \n\t"\
00156     "psraw                 $7, %%mm1      \n\t"\
00157     "packuswb           %%mm1, %%mm0      \n\t"\
00158     MOVNTQ(%%mm0, (%1, %%REGa))\
00159     "add                   $8, %%"REG_a"  \n\t"\
00160     "jnc                   1b             \n\t"
00161 
/* YSCALEYUV2YV121_ACCURATE
 * Like YSCALEYUV2YV121 but rounds instead of truncating: mm7 is built as
 * 0x0040 in every word lane (all-ones >> 15 << 6 == 64) and added with
 * saturation before the >>7, i.e. round-to-nearest. */
00162 #define YSCALEYUV2YV121_ACCURATE \
00163     "mov %2, %%"REG_a"                    \n\t"\
00164     "pcmpeqw %%mm7, %%mm7                 \n\t"\
00165     "psrlw                 $15, %%mm7     \n\t"\
00166     "psllw                  $6, %%mm7     \n\t"\
00167     ASMALIGN(4) /* FIXME Unroll? */\
00168     "1:                                   \n\t"\
00169     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
00170     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
00171     "paddsw             %%mm7, %%mm0      \n\t"\
00172     "paddsw             %%mm7, %%mm1      \n\t"\
00173     "psraw                 $7, %%mm0      \n\t"\
00174     "psraw                 $7, %%mm1      \n\t"\
00175     "packuswb           %%mm1, %%mm0      \n\t"\
00176     MOVNTQ(%%mm0, (%1, %%REGa))\
00177     "add                   $8, %%"REG_a"  \n\t"\
00178     "jnc                   1b             \n\t"
00179 
00180 /*
00181     :: "m" (-lumFilterSize), "m" (-chrFilterSize),
00182        "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
00183        "r" (dest), "m" (dstW),
00184        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
00185     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
00186 */
/* YSCALEYUV2PACKEDX_UV
 * Chroma half of the packed-output vertical scaler.  Opens the asm
 * statement and the per-pixel outer loop (label 1:), then runs the chroma
 * filter loop (label 2:, NULL-pointer terminated), accumulating filtered U
 * into mm3 and V (located VOF bytes further in each source line) into mm4
 * on top of the VROUNDER_OFFSET rounder.
 * NOTE: this macro begins an __asm__ volatile( that must be closed by one
 * of the YSCALEYUV2PACKEDX*_END-style epilogues. */
00187 #define YSCALEYUV2PACKEDX_UV \
00188     __asm__ volatile(\
00189         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
00190         ASMALIGN(4)\
00191         "nop                                            \n\t"\
00192         "1:                                             \n\t"\
00193         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00194         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00195         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
00196         "movq                      %%mm3, %%mm4         \n\t"\
00197         ASMALIGN(4)\
00198         "2:                                             \n\t"\
00199         "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
00200         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
00201         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
00202         "add                         $16, %%"REG_d"     \n\t"\
00203         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00204         "pmulhw                    %%mm0, %%mm2         \n\t"\
00205         "pmulhw                    %%mm0, %%mm5         \n\t"\
00206         "paddw                     %%mm2, %%mm3         \n\t"\
00207         "paddw                     %%mm5, %%mm4         \n\t"\
00208         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00209         " jnz                         2b                \n\t"\
00210 
/* YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 * Luma (or alpha) counterpart of YSCALEYUV2PACKEDX_UV: same NULL-terminated
 * filter loop over the list at "offset"(%0), but the scratch/accumulator
 * registers are macro parameters so the same code can be instantiated for
 * different planes without clobbering the chroma results. */
00211 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
00212     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00213     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00214     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
00215     "movq                    "#dst1", "#dst2"       \n\t"\
00216     ASMALIGN(4)\
00217     "2:                                             \n\t"\
00218     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
00219     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
00220     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
00221     "add                         $16, %%"REG_d"            \n\t"\
00222     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00223     "pmulhw                 "#coeff", "#src1"       \n\t"\
00224     "pmulhw                 "#coeff", "#src2"       \n\t"\
00225     "paddw                   "#src1", "#dst1"       \n\t"\
00226     "paddw                   "#src2", "#dst2"       \n\t"\
00227     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00228     " jnz                         2b                \n\t"\
00229 
/* YSCALEYUV2PACKEDX: full packed-output prologue — chroma filter pass, then
 * the luma pass with mm1/mm7 as Y accumulators (mm3/mm4 hold U/V). */
00230 #define YSCALEYUV2PACKEDX \
00231     YSCALEYUV2PACKEDX_UV \
00232     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
00233 
/* YSCALEYUV2PACKEDX_END: epilogue closing the asm statement opened by
 * YSCALEYUV2PACKEDX(_ACCURATE)_UV — supplies the operand list
 * (%0=&c->redDither, %4=dest, %5=dstW; the "dummy" operands keep the
 * positional numbering) and the register clobbers. */
00234 #define YSCALEYUV2PACKEDX_END                     \
00235         :: "r" (&c->redDither),                   \
00236             "m" (dummy), "m" (dummy), "m" (dummy),\
00237             "r" (dest), "m" (dstW)                \
00238         : "%"REG_a, "%"REG_d, "%"REG_S            \
00239     );
00240 
/* YSCALEYUV2PACKEDX_ACCURATE_UV
 * High-precision chroma pass for the packed-output scaler: APCK_* packed
 * filter layout, 32-bit pmaddwd accumulation, rounder added after the
 * 32->16 narrowing.  The filtered U/V words are parked in the context's
 * U_TEMP/V_TEMP scratch slots because all eight MMX registers are needed
 * for the following luma pass.  Opens an __asm__ volatile( closed later by
 * an *_END epilogue. */
00241 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00242     __asm__ volatile(\
00243         "xor %%"REG_a", %%"REG_a"                       \n\t"\
00244         ASMALIGN(4)\
00245         "nop                                            \n\t"\
00246         "1:                                             \n\t"\
00247         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00248         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00249         "pxor                      %%mm4, %%mm4         \n\t"\
00250         "pxor                      %%mm5, %%mm5         \n\t"\
00251         "pxor                      %%mm6, %%mm6         \n\t"\
00252         "pxor                      %%mm7, %%mm7         \n\t"\
00253         ASMALIGN(4)\
00254         "2:                                             \n\t"\
00255         "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
00256         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
00257         "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00258         "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
00259         "movq                      %%mm0, %%mm3         \n\t"\
00260         "punpcklwd                 %%mm1, %%mm0         \n\t"\
00261         "punpckhwd                 %%mm1, %%mm3         \n\t"\
00262         "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
00263         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00264         "pmaddwd                   %%mm1, %%mm3         \n\t"\
00265         "paddd                     %%mm0, %%mm4         \n\t"\
00266         "paddd                     %%mm3, %%mm5         \n\t"\
00267         "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
00268         "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00269         "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00270         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00271         "movq                      %%mm2, %%mm0         \n\t"\
00272         "punpcklwd                 %%mm3, %%mm2         \n\t"\
00273         "punpckhwd                 %%mm3, %%mm0         \n\t"\
00274         "pmaddwd                   %%mm1, %%mm2         \n\t"\
00275         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00276         "paddd                     %%mm2, %%mm6         \n\t"\
00277         "paddd                     %%mm0, %%mm7         \n\t"\
00278         " jnz                         2b                \n\t"\
00279         "psrad                       $16, %%mm4         \n\t"\
00280         "psrad                       $16, %%mm5         \n\t"\
00281         "psrad                       $16, %%mm6         \n\t"\
00282         "psrad                       $16, %%mm7         \n\t"\
00283         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00284         "packssdw                  %%mm5, %%mm4         \n\t"\
00285         "packssdw                  %%mm7, %%mm6         \n\t"\
00286         "paddw                     %%mm0, %%mm4         \n\t"\
00287         "paddw                     %%mm0, %%mm6         \n\t"\
00288         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
00289         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
00290 
/* YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 * High-precision luma pass: same APCK_*/pmaddwd scheme as the _UV variant,
 * accumulating Y1 in mm1 and Y2 in mm7.  Finishes by reloading the U/V
 * results stashed in U_TEMP/V_TEMP back into mm3/mm4, restoring the
 * register layout the RGB conversion macros expect. */
00291 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00292     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00293     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00294     "pxor                      %%mm1, %%mm1         \n\t"\
00295     "pxor                      %%mm5, %%mm5         \n\t"\
00296     "pxor                      %%mm7, %%mm7         \n\t"\
00297     "pxor                      %%mm6, %%mm6         \n\t"\
00298     ASMALIGN(4)\
00299     "2:                                             \n\t"\
00300     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
00301     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
00302     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00303     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
00304     "movq                      %%mm0, %%mm3         \n\t"\
00305     "punpcklwd                 %%mm4, %%mm0         \n\t"\
00306     "punpckhwd                 %%mm4, %%mm3         \n\t"\
00307     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
00308     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00309     "pmaddwd                   %%mm4, %%mm3         \n\t"\
00310     "paddd                     %%mm0, %%mm1         \n\t"\
00311     "paddd                     %%mm3, %%mm5         \n\t"\
00312     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
00313     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00314     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00315     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00316     "movq                      %%mm2, %%mm0         \n\t"\
00317     "punpcklwd                 %%mm3, %%mm2         \n\t"\
00318     "punpckhwd                 %%mm3, %%mm0         \n\t"\
00319     "pmaddwd                   %%mm4, %%mm2         \n\t"\
00320     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00321     "paddd                     %%mm2, %%mm7         \n\t"\
00322     "paddd                     %%mm0, %%mm6         \n\t"\
00323     " jnz                         2b                \n\t"\
00324     "psrad                       $16, %%mm1         \n\t"\
00325     "psrad                       $16, %%mm5         \n\t"\
00326     "psrad                       $16, %%mm7         \n\t"\
00327     "psrad                       $16, %%mm6         \n\t"\
00328     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00329     "packssdw                  %%mm5, %%mm1         \n\t"\
00330     "packssdw                  %%mm6, %%mm7         \n\t"\
00331     "paddw                     %%mm0, %%mm1         \n\t"\
00332     "paddw                     %%mm0, %%mm7         \n\t"\
00333     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
00334     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
00335 
/* YSCALEYUV2PACKEDX_ACCURATE: high-precision prologue — chroma pass, then
 * the luma pass driven by the luma filter list. */
00336 #define YSCALEYUV2PACKEDX_ACCURATE \
00337     YSCALEYUV2PACKEDX_ACCURATE_UV \
00338     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00339 
/* YSCALEYUV2RGBX
 * YUV -> RGB color-space math for the X (multi-tap) path.  Expects
 * mm1/mm7 = Y1/Y2 and mm3/mm4 = U/V from the preceding macros; subtracts
 * the U/V/Y offsets, scales with the per-context coefficients (all read
 * relative to %0), interleaves the per-pixel sums and saturates, leaving
 * packed B in mm2, R in mm5 and G in mm4 ready for the output writers. */
00340 #define YSCALEYUV2RGBX \
00341     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
00342     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
00343     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
00344     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
00345     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
00346     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
00347     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00348     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
00349     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
00350     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
00351     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
00352     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
00353     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
00354     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00355     "paddw           %%mm3, %%mm4       \n\t"\
00356     "movq            %%mm2, %%mm0       \n\t"\
00357     "movq            %%mm5, %%mm6       \n\t"\
00358     "movq            %%mm4, %%mm3       \n\t"\
00359     "punpcklwd       %%mm2, %%mm2       \n\t"\
00360     "punpcklwd       %%mm5, %%mm5       \n\t"\
00361     "punpcklwd       %%mm4, %%mm4       \n\t"\
00362     "paddw           %%mm1, %%mm2       \n\t"\
00363     "paddw           %%mm1, %%mm5       \n\t"\
00364     "paddw           %%mm1, %%mm4       \n\t"\
00365     "punpckhwd       %%mm0, %%mm0       \n\t"\
00366     "punpckhwd       %%mm6, %%mm6       \n\t"\
00367     "punpckhwd       %%mm3, %%mm3       \n\t"\
00368     "paddw           %%mm7, %%mm0       \n\t"\
00369     "paddw           %%mm7, %%mm6       \n\t"\
00370     "paddw           %%mm7, %%mm3       \n\t"\
00371     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00372     "packuswb        %%mm0, %%mm2       \n\t"\
00373     "packuswb        %%mm6, %%mm5       \n\t"\
00374     "packuswb        %%mm3, %%mm4       \n\t"\
00375 
/* REAL_YSCALEYUV2PACKED(index, c)
 * 2-tap vertical blend for packed (non-RGB-converted) output: pre-shifts
 * the stored luma/chroma blend coefficients right by 3 (writing them back
 * into the context), then per iteration linearly interpolates between
 * uvbuf0/uvbuf1 (chroma, V at offset VOF) and buf0/buf1 (luma) with
 * pmulhw, leaving Y1/Y2 in mm1/mm7 and U/V in mm3/mm4.
 * YSCALEYUV2PACKED is the argument-expanding wrapper. */
00376 #define REAL_YSCALEYUV2PACKED(index, c) \
00377     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
00378     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
00379     "psraw                $3, %%mm0                           \n\t"\
00380     "psraw                $3, %%mm1                           \n\t"\
00381     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00382     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00383     "xor            "#index", "#index"                        \n\t"\
00384     ASMALIGN(4)\
00385     "1:                                 \n\t"\
00386     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00387     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00388     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00389     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00390     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00391     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00392     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00393     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00394     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00395     "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00396     "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00397     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
00398     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
00399     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
00400     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
00401     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
00402     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
00403     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
00404     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
00405     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00406     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00407     "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00408     "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00409     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00410     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00411 
00412 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
00413 
/* REAL_YSCALEYUV2RGB_UV(index, c)
 * 2-tap chroma blend for the RGB path: interpolates U/V between uvbuf0 (%2)
 * and uvbuf1 (%3) using the chroma blend factor stored in the context,
 * then applies the U/V offset subtraction and green-coefficient multiplies.
 * Leaves mm2/mm5 = (U-128)*8 / (V-128)*8 and mm3/mm4 = ug/vg. */
00414 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00415     "xor            "#index", "#index"  \n\t"\
00416     ASMALIGN(4)\
00417     "1:                                 \n\t"\
00418     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00419     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00420     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00421     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00422     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00423     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00424     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00425     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00426     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00427     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00428     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00429     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
00430     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
00431     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00432     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00433     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00434     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00435     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00436     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00437     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00438 
/* REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 * 2-tap luma blend for the RGB path: interpolates between line buffers b1
 * and b2 with the luma blend factor from the context, leaving Y1 in mm1
 * and Y2 in mm7.  b1/b2 are passed in so the same macro serves luma and
 * alpha buffers. */
00439 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
00440     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
00441     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
00442     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
00443     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
00444     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
00445     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
00446     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00447     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00448     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00449     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00450     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00451     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00452 
/* REAL_YSCALEYUV2RGB_COEFF(c)
 * Final RGB math shared by the 2-tap path: same coefficient multiply,
 * interleave and saturate sequence as YSCALEYUV2RGBX, but with the
 * offsets/coefficients addressed relative to the context parameter c
 * instead of %0.  Output register layout is identical
 * (mm2=B, mm4=G, mm5=R, packed). */
00453 #define REAL_YSCALEYUV2RGB_COEFF(c) \
00454     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00455     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00456     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00457     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00458     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00459     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00460     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00461     "paddw             %%mm3, %%mm4     \n\t"\
00462     "movq              %%mm2, %%mm0     \n\t"\
00463     "movq              %%mm5, %%mm6     \n\t"\
00464     "movq              %%mm4, %%mm3     \n\t"\
00465     "punpcklwd         %%mm2, %%mm2     \n\t"\
00466     "punpcklwd         %%mm5, %%mm5     \n\t"\
00467     "punpcklwd         %%mm4, %%mm4     \n\t"\
00468     "paddw             %%mm1, %%mm2     \n\t"\
00469     "paddw             %%mm1, %%mm5     \n\t"\
00470     "paddw             %%mm1, %%mm4     \n\t"\
00471     "punpckhwd         %%mm0, %%mm0     \n\t"\
00472     "punpckhwd         %%mm6, %%mm6     \n\t"\
00473     "punpckhwd         %%mm3, %%mm3     \n\t"\
00474     "paddw             %%mm7, %%mm0     \n\t"\
00475     "paddw             %%mm7, %%mm6     \n\t"\
00476     "paddw             %%mm7, %%mm3     \n\t"\
00477     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00478     "packuswb          %%mm0, %%mm2     \n\t"\
00479     "packuswb          %%mm6, %%mm5     \n\t"\
00480     "packuswb          %%mm3, %%mm4     \n\t"\
00481 
/* Argument-expanding wrapper for the luma/alpha blend macro above. */
00482 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
00483 
/* YSCALEYUV2RGB(index, c): complete 2-tap YUV->RGB pipeline — chroma
 * blend, luma blend over buffers %0/%1, then the shared coefficient math. */
00484 #define YSCALEYUV2RGB(index, c) \
00485     REAL_YSCALEYUV2RGB_UV(index, c) \
00486     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
00487     REAL_YSCALEYUV2RGB_COEFF(c)
00488 
/* REAL_YSCALEYUV2PACKED1(index, c)
 * 1-tap (single source line) variant for packed output: no blending — just
 * read uvbuf0 (%2, V at offset VOF) and buf0 (%0) and shift down by 7.
 * Leaves U/V in mm3/mm4 and Y1/Y2 in mm1/mm7. */
00489 #define REAL_YSCALEYUV2PACKED1(index, c) \
00490     "xor            "#index", "#index"  \n\t"\
00491     ASMALIGN(4)\
00492     "1:                                 \n\t"\
00493     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00494     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
00495     "psraw                $7, %%mm3     \n\t" \
00496     "psraw                $7, %%mm4     \n\t" \
00497     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00498     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00499     "psraw                $7, %%mm1     \n\t" \
00500     "psraw                $7, %%mm7     \n\t" \
00501 
00502 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
00503 
/* REAL_YSCALEYUV2RGB1(index, c)
 * 1-tap YUV->RGB: single source line (uvbuf0/buf0), >>4 instead of a
 * blend, then the same offset/coefficient/interleave/saturate sequence as
 * REAL_YSCALEYUV2RGB_COEFF, producing packed B/G/R in mm2/mm4/mm5. */
00504 #define REAL_YSCALEYUV2RGB1(index, c) \
00505     "xor            "#index", "#index"  \n\t"\
00506     ASMALIGN(4)\
00507     "1:                                 \n\t"\
00508     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00509     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
00510     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00511     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00512     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00513     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00514     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00515     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00516     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00517     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00518     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00519     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00520     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00521     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00522     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00523     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00524     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00525     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00526     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00527     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00528     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00529     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00530     "paddw             %%mm3, %%mm4     \n\t"\
00531     "movq              %%mm2, %%mm0     \n\t"\
00532     "movq              %%mm5, %%mm6     \n\t"\
00533     "movq              %%mm4, %%mm3     \n\t"\
00534     "punpcklwd         %%mm2, %%mm2     \n\t"\
00535     "punpcklwd         %%mm5, %%mm5     \n\t"\
00536     "punpcklwd         %%mm4, %%mm4     \n\t"\
00537     "paddw             %%mm1, %%mm2     \n\t"\
00538     "paddw             %%mm1, %%mm5     \n\t"\
00539     "paddw             %%mm1, %%mm4     \n\t"\
00540     "punpckhwd         %%mm0, %%mm0     \n\t"\
00541     "punpckhwd         %%mm6, %%mm6     \n\t"\
00542     "punpckhwd         %%mm3, %%mm3     \n\t"\
00543     "paddw             %%mm7, %%mm0     \n\t"\
00544     "paddw             %%mm7, %%mm6     \n\t"\
00545     "paddw             %%mm7, %%mm3     \n\t"\
00546     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00547     "packuswb          %%mm0, %%mm2     \n\t"\
00548     "packuswb          %%mm6, %%mm5     \n\t"\
00549     "packuswb          %%mm3, %%mm4     \n\t"\
00550 
00551 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
00552 
/*
 * Setup for packed (YUY2-style) output with vertical chroma interpolation
 * only: chroma words are (uvbuf0[i] + uvbuf1[i]) >> 8 (operands %2 and %3,
 * V half at byte offset VOF), luma words are buf0[i] >> 7 (operand %0).
 * Leaves Y in mm1/mm7, U in mm3, V in mm4, loop label "1:" open for the
 * following WRITE* macro.
 */
00553 #define REAL_YSCALEYUV2PACKED1b(index, c) \
00554     "xor "#index", "#index"             \n\t"\
00555     ASMALIGN(4)\
00556     "1:                                 \n\t"\
00557     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00558     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00559     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00560     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00561     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00562     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00563     "psrlw                $8, %%mm3     \n\t" \
00564     "psrlw                $8, %%mm4     \n\t" \
00565     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00566     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
00567     "psraw                $7, %%mm1     \n\t" \
00568     "psraw                $7, %%mm7     \n\t"
00569 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
00570 
00571 // do vertical chrominance interpolation
/*
 * Like REAL_YSCALEYUV2RGB1, but chroma is vertically interpolated:
 * (uvbuf0[i] + uvbuf1[i]) >> 5 from operands %2/%3 (V half at byte offset
 * VOF); luma still comes from buf0 (%0) alone.  Same register contract:
 * opens the "1:" loop and leaves packed bytes mm2=B, mm4=G, mm5=R for the
 * following WRITE* macro.
 */
00572 #define REAL_YSCALEYUV2RGB1b(index, c) \
00573     "xor            "#index", "#index"  \n\t"\
00574     ASMALIGN(4)\
00575     "1:                                 \n\t"\
00576     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00577     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00578     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00579     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00580     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00581     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00582     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
00583     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
00584     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00585     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00586     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00587     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00588     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00589     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00590     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00591     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00592     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
00593     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
00594     "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
00595     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00596     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00597     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00598     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00599     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00600     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00601     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00602     "paddw             %%mm3, %%mm4     \n\t"\
00603     "movq              %%mm2, %%mm0     \n\t"\
00604     "movq              %%mm5, %%mm6     \n\t"\
00605     "movq              %%mm4, %%mm3     \n\t"\
00606     "punpcklwd         %%mm2, %%mm2     \n\t"\
00607     "punpcklwd         %%mm5, %%mm5     \n\t"\
00608     "punpcklwd         %%mm4, %%mm4     \n\t"\
00609     "paddw             %%mm1, %%mm2     \n\t"\
00610     "paddw             %%mm1, %%mm5     \n\t"\
00611     "paddw             %%mm1, %%mm4     \n\t"\
00612     "punpckhwd         %%mm0, %%mm0     \n\t"\
00613     "punpckhwd         %%mm6, %%mm6     \n\t"\
00614     "punpckhwd         %%mm3, %%mm3     \n\t"\
00615     "paddw             %%mm7, %%mm0     \n\t"\
00616     "paddw             %%mm7, %%mm6     \n\t"\
00617     "paddw             %%mm7, %%mm3     \n\t"\
00618     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00619     "packuswb          %%mm0, %%mm2     \n\t"\
00620     "packuswb          %%mm6, %%mm5     \n\t"\
00621     "packuswb          %%mm3, %%mm4     \n\t"\
00622 
00623 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
00624 
/*
 * Load 8 alpha samples from abuf0 (operand %1) for the single-line case,
 * round-shift them from 16-bit to 8-bit (>>7) and pack them into mm7.
 * Clobbers mm1; meant to run between a YSCALEYUV2RGB1* setup and WRITEBGR32.
 */
00625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
00626     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
00627     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
00628     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
00629     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
00630     "packuswb          %%mm1, %%mm7     \n\t"
00631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
00632 
/*
 * Store 8 pixels of 32-bit packed output: interleave the byte registers
 * b, g, r, a into four ARGB-ordered quadwords (q0, q2, q3, t are scratch;
 * b and r are destroyed) and write 32 bytes at dst + index*4 via MOVNTQ.
 * Then advance index by 8 and jump back to the "1:" label opened by the
 * matching YSCALEYUV2* macro while index < dstw.
 */
00633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00634     "movq       "#b", "#q2"     \n\t" /* B */\
00635     "movq       "#r", "#t"      \n\t" /* R */\
00636     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
00637     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
00638     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
00639     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
00640     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
00641     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
00642     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
00643     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
00644     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
00645     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
00646 \
00647     MOVNTQ(   q0,   (dst, index, 4))\
00648     MOVNTQ(    b,  8(dst, index, 4))\
00649     MOVNTQ(   q2, 16(dst, index, 4))\
00650     MOVNTQ(   q3, 24(dst, index, 4))\
00651 \
00652     "add      $8, "#index"      \n\t"\
00653     "cmp "#dstw", "#index"      \n\t"\
00654     " jb      1b                \n\t"
00655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00656 
/*
 * Pack mm2=B, mm4=G, mm5=R (unsigned bytes) into 8 RGB565 words and store
 * 16 bytes at dst + index*2.  Expects mm7 == 0 (all call sites issue
 * "pxor %%mm7, %%mm7" first) since mm7 supplies the high interleave bytes.
 * Advances index by 8 and loops back to "1:" while index < dstw.
 */
00657 #define REAL_WRITERGB16(dst, dstw, index) \
00658     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00659     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
00660     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00661     "psrlq           $3, %%mm2  \n\t"\
00662 \
00663     "movq         %%mm2, %%mm1  \n\t"\
00664     "movq         %%mm4, %%mm3  \n\t"\
00665 \
00666     "punpcklbw    %%mm7, %%mm3  \n\t"\
00667     "punpcklbw    %%mm5, %%mm2  \n\t"\
00668     "punpckhbw    %%mm7, %%mm4  \n\t"\
00669     "punpckhbw    %%mm5, %%mm1  \n\t"\
00670 \
00671     "psllq           $3, %%mm3  \n\t"\
00672     "psllq           $3, %%mm4  \n\t"\
00673 \
00674     "por          %%mm3, %%mm2  \n\t"\
00675     "por          %%mm4, %%mm1  \n\t"\
00676 \
00677     MOVNTQ(%%mm2,  (dst, index, 2))\
00678     MOVNTQ(%%mm1, 8(dst, index, 2))\
00679 \
00680     "add             $8, "#index"   \n\t"\
00681     "cmp        "#dstw", "#index"   \n\t"\
00682     " jb             1b             \n\t"
00683 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
00684 
/*
 * Pack mm2=B, mm4=G, mm5=R (unsigned bytes) into 8 RGB555 words (5 bits per
 * channel, hence the bF8 mask on green too) and store 16 bytes at
 * dst + index*2.  Expects mm7 == 0, same as WRITERGB16.
 * Advances index by 8 and loops back to "1:" while index < dstw.
 */
00685 #define REAL_WRITERGB15(dst, dstw, index) \
00686     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00687     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
00688     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00689     "psrlq           $3, %%mm2  \n\t"\
00690     "psrlq           $1, %%mm5  \n\t"\
00691 \
00692     "movq         %%mm2, %%mm1  \n\t"\
00693     "movq         %%mm4, %%mm3  \n\t"\
00694 \
00695     "punpcklbw    %%mm7, %%mm3  \n\t"\
00696     "punpcklbw    %%mm5, %%mm2  \n\t"\
00697     "punpckhbw    %%mm7, %%mm4  \n\t"\
00698     "punpckhbw    %%mm5, %%mm1  \n\t"\
00699 \
00700     "psllq           $2, %%mm3  \n\t"\
00701     "psllq           $2, %%mm4  \n\t"\
00702 \
00703     "por          %%mm3, %%mm2  \n\t"\
00704     "por          %%mm4, %%mm1  \n\t"\
00705 \
00706     MOVNTQ(%%mm2,  (dst, index, 2))\
00707     MOVNTQ(%%mm1, 8(dst, index, 2))\
00708 \
00709     "add             $8, "#index"   \n\t"\
00710     "cmp        "#dstw", "#index"   \n\t"\
00711     " jb             1b             \n\t"
00712 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
00713 
/*
 * Legacy 24-bit writer (kept for reference; not selected by the WRITEBGR24
 * dispatch below).  Takes mm2=B, mm4=G, mm5=R packed bytes with mm7 == 0,
 * builds three quadwords of continuous BGR24 data via shift/mask/or and
 * stores 24 bytes at dst.  Advances dst by 24 and index by 8, then loops
 * back to "1:" while index < dstw.  Clobbers mm0-mm6.
 */
00714 #define WRITEBGR24OLD(dst, dstw, index) \
00715     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00716     "movq      %%mm2, %%mm1             \n\t" /* B */\
00717     "movq      %%mm5, %%mm6             \n\t" /* R */\
00718     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
00719     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
00720     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
00721     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
00722     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
00723     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
00724     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
00725     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
00726     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
00727     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
00728 \
00729     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
00730     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
00731     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
00732     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
00733     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
00734     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
00735     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
00736     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
00737 \
00738     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
00739     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
00740     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
00741     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
00742     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
00743     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
00744     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
00745     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
00746     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
00747     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
00748     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
00749     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
00750     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
00751 \
00752     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
00753     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
00754     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
00755     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
00756     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
00757     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
00758     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
00759     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
00760 \
00761     MOVNTQ(%%mm0,   (dst))\
00762     MOVNTQ(%%mm2,  8(dst))\
00763     MOVNTQ(%%mm3, 16(dst))\
00764     "add         $24, "#dst"            \n\t"\
00765 \
00766     "add          $8, "#index"          \n\t"\
00767     "cmp     "#dstw", "#index"          \n\t"\
00768     " jb          1b                    \n\t"
00769 
/*
 * Plain-MMX 24-bit writer: same contract as WRITEBGR24OLD (mm2=B, mm4=G,
 * mm5=R packed bytes, mm7 == 0 on entry) but uses punpckhdq to combine the
 * shifted halves with fewer mask constants.  Stores 24 bytes at dst,
 * advances dst by 24 and index by 8, loops to "1:" while index < dstw.
 * Clobbers mm0-mm7 (mm7 is reused as scratch after its zero is consumed).
 */
00770 #define WRITEBGR24MMX(dst, dstw, index) \
00771     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00772     "movq      %%mm2, %%mm1     \n\t" /* B */\
00773     "movq      %%mm5, %%mm6     \n\t" /* R */\
00774     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
00775     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
00776     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
00777     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
00778     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
00779     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
00780     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
00781     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
00782     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
00783     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
00784 \
00785     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
00786     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
00787     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
00788     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
00789 \
00790     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
00791     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
00792     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
00793     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
00794 \
00795     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
00796     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
00797     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
00798     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
00799 \
00800     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
00801     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
00802     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
00803     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
00804     MOVNTQ(%%mm0, (dst))\
00805 \
00806     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
00807     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
00808     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
00809     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
00810     MOVNTQ(%%mm6, 8(dst))\
00811 \
00812     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
00813     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
00814     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
00815     MOVNTQ(%%mm5, 16(dst))\
00816 \
00817     "add         $24, "#dst"    \n\t"\
00818 \
00819     "add          $8, "#index"  \n\t"\
00820     "cmp     "#dstw", "#index"  \n\t"\
00821     " jb          1b            \n\t"
00822 
/*
 * MMX2 24-bit writer: same input contract as WRITEBGR24MMX (mm2=B, mm4=G,
 * mm5=R packed bytes) but uses the MMX2 pshufw instruction plus the
 * ff_M24A/ff_M24B/ff_M24C byte-select masks to assemble the three output
 * quadwords directly.  Note mm7 is overwritten with ff_M24C immediately, so
 * no zero register is needed.  Stores 24 bytes at dst, advances dst by 24
 * and index by 8, loops to "1:" while index < dstw.
 */
00823 #define WRITEBGR24MMX2(dst, dstw, index) \
00824     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00825     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00826     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00827     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
00828     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
00829     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
00830 \
00831     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
00832     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
00833     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
00834 \
00835     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
00836     "por    %%mm1, %%mm6        \n\t"\
00837     "por    %%mm3, %%mm6        \n\t"\
00838     MOVNTQ(%%mm6, (dst))\
00839 \
00840     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
00841     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
00842     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
00843     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
00844 \
00845     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
00846     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
00847     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
00848 \
00849     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
00850     "por    %%mm3, %%mm6        \n\t"\
00851     MOVNTQ(%%mm6, 8(dst))\
00852 \
00853     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
00854     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
00855     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
00856 \
00857     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
00858     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
00859     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
00860 \
00861     "por    %%mm1, %%mm3        \n\t"\
00862     "por    %%mm3, %%mm6        \n\t"\
00863     MOVNTQ(%%mm6, 16(dst))\
00864 \
00865     "add      $24, "#dst"       \n\t"\
00866 \
00867     "add       $8, "#index"     \n\t"\
00868     "cmp  "#dstw", "#index"     \n\t"\
00869     " jb       1b               \n\t"
00870 
/* Select the 24-bit writer for this template instantiation: the pshufw-based
 * variant when compiling the MMX2 template, the plain-MMX variant otherwise. */
00871 #if COMPILE_TEMPLATE_MMX2
00872 #undef WRITEBGR24
00873 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
00874 #else
00875 #undef WRITEBGR24
00876 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
00877 #endif
00878 
/*
 * Pack the YSCALEYUV2PACKED* register layout (Y words in mm1/mm7, U words in
 * mm3, V words in mm4) into interleaved YUYV bytes and store 16 bytes at
 * dst + index*2.  Advances index by 8 and loops back to "1:" while
 * index < dstw.
 */
00879 #define REAL_WRITEYUY2(dst, dstw, index) \
00880     "packuswb  %%mm3, %%mm3     \n\t"\
00881     "packuswb  %%mm4, %%mm4     \n\t"\
00882     "packuswb  %%mm7, %%mm1     \n\t"\
00883     "punpcklbw %%mm4, %%mm3     \n\t"\
00884     "movq      %%mm1, %%mm7     \n\t"\
00885     "punpcklbw %%mm3, %%mm1     \n\t"\
00886     "punpckhbw %%mm3, %%mm7     \n\t"\
00887 \
00888     MOVNTQ(%%mm1, (dst, index, 2))\
00889     MOVNTQ(%%mm7, 8(dst, index, 2))\
00890 \
00891     "add          $8, "#index"  \n\t"\
00892     "cmp     "#dstw", "#index"  \n\t"\
00893     " jb          1b            \n\t"
00894 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
00895 
00896 
/**
 * Vertically scale (FIR-filter and sum) the lumSrc/chrSrc/alpSrc line sets
 * into planar YV12 output (dest/uDest/vDest, optional aDest alpha plane).
 * Fast path: the MMX YSCALEYUV2YV12X* kernels, skipped when SWS_BITEXACT is
 * set; SWS_ACCURATE_RND selects the accurate-rounding variant.  Otherwise
 * falls back to AltiVec (no alpha support there) or the plain C routine.
 */
00897 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
00898                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
00899                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
00900 {
00901 #if COMPILE_TEMPLATE_MMX
00902     if(!(c->flags & SWS_BITEXACT)) {
00903         if (c->flags & SWS_ACCURATE_RND) {
00904             if (uDest) {
00905                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00906                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00907             }
00908             if (CONFIG_SWSCALE_ALPHA && aDest) {
00909                 YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
00910             }
00911 
00912             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00913         } else {
00914             if (uDest) {
00915                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00916                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00917             }
00918             if (CONFIG_SWSCALE_ALPHA && aDest) {
00919                 YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
00920             }
00921 
00922             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00923         }
00924         return;
00925     }
00926 #endif
00927 #if COMPILE_TEMPLATE_ALTIVEC
00928     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
00929                           chrFilter, chrSrc, chrFilterSize,
00930                           dest, uDest, vDest, dstW, chrDstW);
00931 #else //COMPILE_TEMPLATE_ALTIVEC
00932     yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
00933                 chrFilter, chrSrc, chrFilterSize,
00934                 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
00935 #endif //!COMPILE_TEMPLATE_ALTIVEC
00936 }
00937 
/**
 * Vertical scaling to semi-planar NV12/NV21 output.  No SIMD implementation
 * exists in this template, so this is a plain wrapper that always delegates
 * to the C routine yuv2nv12XinC() (the unused SwsContext keeps the signature
 * parallel with the other yuv2* entry points).
 */
00938 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
00939                                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
00940                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
00941 {
00942     yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
00943                  chrFilter, chrSrc, chrFilterSize,
00944                  dest, uDest, dstW, chrDstW, dstFormat);
00945 }
00946 
/**
 * 1:1 vertical scale (exactly one input line per plane): each 16-bit sample
 * is rounded down to 8 bits ((x+64)>>7) and clipped to [0,255].  The MMX
 * path drives all four planes (alpha, luma, U, V) through the
 * YSCALEYUV2YV121* kernels unless SWS_BITEXACT is set; the C fallback below
 * handles luma, chroma (V plane at chrSrc+VOFW) and alpha separately.
 */
00947 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
00948                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
00949 {
00950     int i;
00951 #if COMPILE_TEMPLATE_MMX
00952     if(!(c->flags & SWS_BITEXACT)) {
00953         long p= 4;
00954         const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
00955         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
00956         x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
00957 
00958         if (c->flags & SWS_ACCURATE_RND) {
00959             while(p--) {
00960                 if (dst[p]) {
00961                     __asm__ volatile(
00962                         YSCALEYUV2YV121_ACCURATE
00963                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
00964                         "g" (-counter[p])
00965                         : "%"REG_a
00966                     );
00967                 }
00968             }
00969         } else {
00970             while(p--) {
00971                 if (dst[p]) {
00972                     __asm__ volatile(
00973                         YSCALEYUV2YV121
00974                         :: "r" (src[p]), "r" (dst[p] + counter[p]),
00975                         "g" (-counter[p])
00976                         : "%"REG_a
00977                     );
00978                 }
00979             }
00980         }
00981         return;
00982     }
00983 #endif
00984     for (i=0; i<dstW; i++) {
00985         int val= (lumSrc[i]+64)>>7;
00986 
        /* lumSrc is int16_t, so val is confined to [-256, 256]; within that
           range bit 8 is set exactly for the out-of-range values (-256..-1
           and 256), which makes this cheap clip test valid. */
00987         if (val&256) {
00988             if (val<0) val=0;
00989             else       val=255;
00990         }
00991 
00992         dest[i]= val;
00993     }
00994 
00995     if (uDest)
00996         for (i=0; i<chrDstW; i++) {
00997             int u=(chrSrc[i       ]+64)>>7;
00998             int v=(chrSrc[i + VOFW]+64)>>7;
00999 
            /* Same bit-8 trick as above, checked for both samples at once. */
01000             if ((u|v)&256) {
01001                 if (u<0)        u=0;
01002                 else if (u>255) u=255;
01003                 if (v<0)        v=0;
01004                 else if (v>255) v=255;
01005             }
01006 
01007             uDest[i]= u;
01008             vDest[i]= v;
01009         }
01010 
01011     if (CONFIG_SWSCALE_ALPHA && aDest)
01012         for (i=0; i<dstW; i++) {
01013             int val= (alpSrc[i]+64)>>7;
01014             aDest[i]= av_clip_uint8(val);
01015         }
01016 }
01017 
01018 
/**
 * Vertical scaling straight to packed output formats.  The MMX section
 * (skipped under SWS_BITEXACT) switches on c->dstFormat with dedicated asm
 * paths for RGB32 (optionally with real per-pixel alpha when alpPixBuf is
 * set), BGR24, RGB555, RGB565 and YUYV422; SWS_ACCURATE_RND selects the
 * accurate-rounding YSCALEYUV2PACKEDX_ACCURATE kernels.  Formats without an
 * MMX path fall through to AltiVec (for the formats ff_yuv2packedX_altivec
 * supports, and only without alpha) or to the C yuv2packedXinC().
 */
01022 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
01023                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
01024                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
01025 {
01026 #if COMPILE_TEMPLATE_MMX
01027     x86_reg dummy=0;
01028     if(!(c->flags & SWS_BITEXACT)) {
01029         if (c->flags & SWS_ACCURATE_RND) {
01030             switch(c->dstFormat) {
01031             case PIX_FMT_RGB32:
01032                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01033                     YSCALEYUV2PACKEDX_ACCURATE
01034                     YSCALEYUV2RGBX
                    /* Spill B/G/R so the alpha pass below can reuse the MMX
                       registers, then reload before the interleaved store. */
01035                     "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
01036                     "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
01037                     "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
01038                     YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
01039                     "movq               "Y_TEMP"(%0), %%mm5         \n\t"
01040                     "psraw                        $3, %%mm1         \n\t"
01041                     "psraw                        $3, %%mm7         \n\t"
01042                     "packuswb                  %%mm7, %%mm1         \n\t"
01043                     WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
01044 
01045                     YSCALEYUV2PACKEDX_END
01046                 } else {
01047                     YSCALEYUV2PACKEDX_ACCURATE
01048                     YSCALEYUV2RGBX
                    /* No alpha plane: fill A with 0xFF (opaque). */
01049                     "pcmpeqd %%mm7, %%mm7 \n\t"
01050                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01051 
01052                     YSCALEYUV2PACKEDX_END
01053                 }
01054                 return;
01055             case PIX_FMT_BGR24:
01056                 YSCALEYUV2PACKEDX_ACCURATE
01057                 YSCALEYUV2RGBX
01058                 "pxor %%mm7, %%mm7 \n\t"
01059                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
01060                 "add %4, %%"REG_c"                        \n\t"
01061                 WRITEBGR24(%%REGc, %5, %%REGa)
01062 
01063 
01064                 :: "r" (&c->redDither),
01065                 "m" (dummy), "m" (dummy), "m" (dummy),
01066                 "r" (dest), "m" (dstW)
01067                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
01068                 );
01069                 return;
01070             case PIX_FMT_RGB555:
01071                 YSCALEYUV2PACKEDX_ACCURATE
01072                 YSCALEYUV2RGBX
01073                 "pxor %%mm7, %%mm7 \n\t"
01074                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01075 #ifdef DITHER1XBPP
01076                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
01077                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
01078                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
01079 #endif
01080 
01081                 WRITERGB15(%4, %5, %%REGa)
01082                 YSCALEYUV2PACKEDX_END
01083                 return;
01084             case PIX_FMT_RGB565:
01085                 YSCALEYUV2PACKEDX_ACCURATE
01086                 YSCALEYUV2RGBX
01087                 "pxor %%mm7, %%mm7 \n\t"
01088                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01089 #ifdef DITHER1XBPP
01090                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
01091                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
01092                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
01093 #endif
01094 
01095                 WRITERGB16(%4, %5, %%REGa)
01096                 YSCALEYUV2PACKEDX_END
01097                 return;
01098             case PIX_FMT_YUYV422:
01099                 YSCALEYUV2PACKEDX_ACCURATE
01100                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01101 
01102                 "psraw $3, %%mm3    \n\t"
01103                 "psraw $3, %%mm4    \n\t"
01104                 "psraw $3, %%mm1    \n\t"
01105                 "psraw $3, %%mm7    \n\t"
01106                 WRITEYUY2(%4, %5, %%REGa)
01107                 YSCALEYUV2PACKEDX_END
01108                 return;
01109             }
01110         } else {
            /* Same format dispatch as above, using the standard-rounding
               YSCALEYUV2PACKEDX kernels. */
01111             switch(c->dstFormat) {
01112             case PIX_FMT_RGB32:
01113                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01114                     YSCALEYUV2PACKEDX
01115                     YSCALEYUV2RGBX
01116                     YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
01117                     "psraw                        $3, %%mm1         \n\t"
01118                     "psraw                        $3, %%mm7         \n\t"
01119                     "packuswb                  %%mm7, %%mm1         \n\t"
01120                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
01121                     YSCALEYUV2PACKEDX_END
01122                 } else {
01123                     YSCALEYUV2PACKEDX
01124                     YSCALEYUV2RGBX
01125                     "pcmpeqd %%mm7, %%mm7 \n\t"
01126                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01127                     YSCALEYUV2PACKEDX_END
01128                 }
01129                 return;
01130             case PIX_FMT_BGR24:
01131                 YSCALEYUV2PACKEDX
01132                 YSCALEYUV2RGBX
01133                 "pxor                    %%mm7, %%mm7       \n\t"
01134                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
01135                 "add                        %4, %%"REG_c"   \n\t"
01136                 WRITEBGR24(%%REGc, %5, %%REGa)
01137 
01138                 :: "r" (&c->redDither),
01139                 "m" (dummy), "m" (dummy), "m" (dummy),
01140                 "r" (dest),  "m" (dstW)
01141                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
01142                 );
01143                 return;
01144             case PIX_FMT_RGB555:
01145                 YSCALEYUV2PACKEDX
01146                 YSCALEYUV2RGBX
01147                 "pxor %%mm7, %%mm7 \n\t"
01148                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01149 #ifdef DITHER1XBPP
01150                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
01151                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
01152                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
01153 #endif
01154 
01155                 WRITERGB15(%4, %5, %%REGa)
01156                 YSCALEYUV2PACKEDX_END
01157                 return;
01158             case PIX_FMT_RGB565:
01159                 YSCALEYUV2PACKEDX
01160                 YSCALEYUV2RGBX
01161                 "pxor %%mm7, %%mm7 \n\t"
01162                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01163 #ifdef DITHER1XBPP
01164                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
01165                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
01166                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
01167 #endif
01168 
01169                 WRITERGB16(%4, %5, %%REGa)
01170                 YSCALEYUV2PACKEDX_END
01171                 return;
01172             case PIX_FMT_YUYV422:
01173                 YSCALEYUV2PACKEDX
01174                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01175 
01176                 "psraw $3, %%mm3    \n\t"
01177                 "psraw $3, %%mm4    \n\t"
01178                 "psraw $3, %%mm1    \n\t"
01179                 "psraw $3, %%mm7    \n\t"
01180                 WRITEYUY2(%4, %5, %%REGa)
01181                 YSCALEYUV2PACKEDX_END
01182                 return;
01183             }
01184         }
01185     }
01186 #endif /* COMPILE_TEMPLATE_MMX */
01187 #if COMPILE_TEMPLATE_ALTIVEC
01188     /* The following list of supported dstFormat values should
01189        match what's found in the body of ff_yuv2packedX_altivec() */
01190     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
01191          (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
01192           c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
01193           c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
01194             ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
01195                                    chrFilter, chrSrc, chrFilterSize,
01196                                    dest, dstW, dstY);
01197     else
01198 #endif
01199         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
01200                        chrFilter, chrSrc, chrFilterSize,
01201                        alpSrc, dest, dstW, dstY);
01202 }
01203 
/**
 * Vertical bilinear scale plus YUV->packed-pixel conversion for two input
 * lines: luma is blended from buf0/buf1 with weight yalpha, chroma from
 * uvbuf0/uvbuf1 with weight uvalpha, optional alpha from abuf0/abuf1.
 * Writes dstW pixels of c->dstFormat to dest. MMX fast paths are used
 * unless SWS_BITEXACT is set; otherwise the generic C macro runs.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;   // complementary luma weight (12-bit fixed point)
    int uvalpha1=4095-uvalpha;   // complementary chroma weight
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                /* x86-64 has enough registers to pass abuf0/abuf1 directly
                 * as asm inputs (%6/%7) and use r8 as the loop counter. */
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                /* x86-32: not enough free GPRs to pass abuf0/abuf1 as asm
                 * operands, so stash them in the context and reload them
                 * inside the asm via U_TEMP/V_TEMP (borrowing %0/%1, which
                 * are pushed/popped around the use). */
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    /* REG_b is saved into the context (ESP_OFFSET slot) rather
                     * than clobber-listed — presumably because ebx may be
                     * reserved (e.g. as the PIC register); TODO confirm. */
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push                   %0              \n\t"
                    "push                   %1              \n\t"
                    "mov          "U_TEMP"(%5), %0          \n\t"
                    "mov          "V_TEMP"(%5), %1          \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    "pop                    %1              \n\t"
                    "pop                    %0              \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                /* No alpha plane: fill the alpha channel with all-ones. */
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* per-channel ordered dither before truncation to 5/5/5 */
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* per-channel ordered dither before truncation to 5/6/5 */
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            /* packed YUV output: interpolate only, no colorspace conversion */
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov %4, %%"REG_b"                        \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    /* Generic C fallback covering all remaining output formats. */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
01352 
/**
 * YUV -> packed-pixel conversion for a single luma line (no vertical luma
 * scaling). Chroma comes from uvbuf0 alone when uvalpha < 2048 (nearest
 * neighbour, fast path), otherwise from the average of uvbuf0 and uvbuf1.
 * With SWS_FULL_CHR_H_INT set, the work is delegated to c->yuv2packed2.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        // full horizontal chroma interpolation requested: reuse the
        // two-line path with both lines set to buf0 (yalpha = 0)
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* alpha plane present: abuf0 rides in the "d" operand
                     * and is merged in by YSCALEYUV2RGB1_ALPHA */
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    /* no alpha plane: set the alpha channel to all-ones */
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered dither before truncation to 5/5/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered dither before truncation to 5/6/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: use the ...1b macro variants, which average
             * uvbuf0 and uvbuf1 instead of taking uvbuf0 alone */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered dither before truncation to 5/5/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered dither before truncation to 5/6/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    /* Generic C fallbacks: nearest-chroma vs averaged-chroma variants. */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
01593 
01594 //FIXME yuy2* can read up to 7 samples too much
01595 
/**
 * Extract the luma (Y) plane from packed YUYV input: Y is the even byte
 * of each 2-byte sample. Writes 'width' bytes to dst.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* src/dst are passed pre-advanced to their ends; REG_a counts from
     * -width up to 0, processing 8 output pixels per iteration. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t" /* mask selecting the low byte of each word */
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
01620 
/**
 * Extract the chroma planes from packed YUYV input: U is byte 1 and V is
 * byte 3 of each 4-byte group. Writes 'width' bytes to each of dstU/dstV.
 * src2 must equal src1 (asserted below); it exists only to match the
 * common input-function signature.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop over 4 output samples (16 input bytes) per
     * iteration: shift out Y, then split the UVUV words into U and V. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* drop Y bytes, keep U/V */
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 = V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 = U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
01654 
/**
 * Extract the high (odd) byte of each little-endian 16-bit sample from two
 * separate planes: dstU[i] = src1[2i+1], dstV[i] = src2[2i+1].
 * Writes 'width' bytes to each destination.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop, 8 output samples per plane per iteration. */
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* keep the high byte of each word */
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
01686 
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma (Y) plane from packed UYVY input: Y is the odd byte
 * of each 2-byte sample. Writes 'width' bytes to dst.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop, 8 output pixels per iteration; the shift keeps
     * the high byte of each word (the Y sample in UYVY). */
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t"
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
01712 
/**
 * Extract the chroma planes from packed UYVY input: U is byte 0 and V is
 * byte 2 of each 4-byte group. Writes 'width' bytes to each of dstU/dstV.
 * src2 must equal src1 (asserted below).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop over 4 output samples (16 input bytes) per
     * iteration: mask out Y, then split the UVUV words into U and V. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep the even bytes (U and V) */
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 = V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 = U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
01746 
/**
 * Extract the even (first) byte of each 16-bit sample from two separate
 * planes: dstU[i] = src1[2i], dstV[i] = src2[2i] — i.e. the high byte of
 * big-endian 16-bit samples. Writes 'width' bytes to each destination.
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop, 8 output samples per plane per iteration;
     * the mask keeps the low byte of each word (the even source byte). */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
01779 
/**
 * Deinterleave a packed 2-byte-per-pair chroma plane (NV12/NV21 style):
 * dst1[i] = src[2i], dst2[i] = src[2i+1]. Writes 'width' bytes to each
 * destination.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    /* Negative-index loop, 8 output samples per plane per iteration:
     * mask keeps the even bytes for dst1, shift keeps the odd for dst2. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
01813 
/**
 * NV12 chroma reader: the interleaved plane is ordered U,V — deinterleave
 * src1 into dstU/dstV. src2 and unused exist only to match the common
 * input-function signature.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
01820 
/* NV21 chroma unpack: same as nv12ToUV but with the destination planes
 * swapped (even bytes of src1 go to dstV, odd bytes to dstU). */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
01827 
01828 #if COMPILE_TEMPLATE_MMX
/* Convert one line of packed 24-bit BGR or RGB to 8-bit luma with MMX.
 * Processes 4 pixels (12 src bytes) per loop iteration.  srcFormat
 * selects between the BGR24 and RGB24 coefficient tables loaded into
 * mm5/mm6; the main loop below is shared by both layouts.  dst is
 * pre-biased by width so REG_a counts from -width up to 0. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    /* mm5/mm6 carry the coefficients from above; mm4 is the rounding
     * offset, mm7 stays zero for byte->word unpacking. */
    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t" // add rounding offset
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
01881 
/* Convert one line of packed 24-bit BGR or RGB to 8-bit U and V planes
 * with MMX.  Processes 4 pixels (12 src bytes) per iteration.  Operand
 * %4 is the 32-byte coefficient table ff_bgr24toUV[...], indexed by
 * srcFormat, so the same code handles both BGR24 and RGB24 input.
 * dstU/dstV are pre-biased by width so REG_a counts from -width to 0. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t" // last 8 bytes of the coeff table
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second pair of pixels */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t" // rounding offset
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
01939 #endif
01940 
01941 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
01942 {
01943 #if COMPILE_TEMPLATE_MMX
01944     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01945 #else
01946     int i;
01947     for (i=0; i<width; i++) {
01948         int b= src[i*3+0];
01949         int g= src[i*3+1];
01950         int r= src[i*3+2];
01951 
01952         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
01953     }
01954 #endif /* COMPILE_TEMPLATE_MMX */
01955 }
01956 
01957 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
01958 {
01959 #if COMPILE_TEMPLATE_MMX
01960     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01961 #else
01962     int i;
01963     for (i=0; i<width; i++) {
01964         int b= src1[3*i + 0];
01965         int g= src1[3*i + 1];
01966         int r= src1[3*i + 2];
01967 
01968         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01969         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01970     }
01971 #endif /* COMPILE_TEMPLATE_MMX */
01972     assert(src1 == src2);
01973 }
01974 
01975 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
01976 {
01977     int i;
01978     for (i=0; i<width; i++) {
01979         int b= src1[6*i + 0] + src1[6*i + 3];
01980         int g= src1[6*i + 1] + src1[6*i + 4];
01981         int r= src1[6*i + 2] + src1[6*i + 5];
01982 
01983         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01984         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01985     }
01986     assert(src1 == src2);
01987 }
01988 
01989 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
01990 {
01991 #if COMPILE_TEMPLATE_MMX
01992     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
01993 #else
01994     int i;
01995     for (i=0; i<width; i++) {
01996         int r= src[i*3+0];
01997         int g= src[i*3+1];
01998         int b= src[i*3+2];
01999 
02000         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
02001     }
02002 #endif
02003 }
02004 
02005 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
02006 {
02007 #if COMPILE_TEMPLATE_MMX
02008     assert(src1==src2);
02009     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
02010 #else
02011     int i;
02012     assert(src1==src2);
02013     for (i=0; i<width; i++) {
02014         int r= src1[3*i + 0];
02015         int g= src1[3*i + 1];
02016         int b= src1[3*i + 2];
02017 
02018         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
02019         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
02020     }
02021 #endif
02022 }
02023 
02024 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
02025 {
02026     int i;
02027     assert(src1==src2);
02028     for (i=0; i<width; i++) {
02029         int r= src1[6*i + 0] + src1[6*i + 3];
02030         int g= src1[6*i + 1] + src1[6*i + 4];
02031         int b= src1[6*i + 2] + src1[6*i + 5];
02032 
02033         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
02034         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
02035     }
02036 }
02037 
02038 
02039 // bilinear / bicubic scaling
/* Horizontal scaling with an arbitrary FIR filter.  For each output
 * pixel i (see the plain-C fallback at the bottom):
 *     dst[i] = min( sum_{j<filterSize} src[filterPos[i]+j] *
 *                   filter[i*filterSize+j] >> 7, 0x7FFF )
 * The MMX paths have specialized loops for filterSize 4 and 8 plus a
 * generic inner loop; they bias the pointers by the (negative) counter
 * so the counter runs up towards 0 and the loops exit via jnc.
 * srcW and xInc are only forwarded to the AltiVec helper. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t"
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        // same scheme as filterSize==4, two 4-tap halves accumulated
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        // generic filterSize: inner loop "2:" walks the taps 4 at a time
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* reference C implementation of the convolution described above */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_MMX */
}
02214 
02215 //FIXME all pal and rgb srcFormats could do this convertion as well
02216 //FIXME all scalers more complex than bilinear could do half of this transform
02217 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
02218 {
02219     int i;
02220     for (i = 0; i < width; i++) {
02221         dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
02222         dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
02223     }
02224 }
02225 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
02226 {
02227     int i;
02228     for (i = 0; i < width; i++) {
02229         dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
02230         dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
02231     }
02232 }
02233 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
02234 {
02235     int i;
02236     for (i = 0; i < width; i++)
02237         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
02238 }
02239 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
02240 {
02241     int i;
02242     for (i = 0; i < width; i++)
02243         dst[i] = (dst[i]*14071 + 33561947)>>14;
02244 }
02245 
/* One bilinear interpolation step.  On entry %edi = src[xx],
 * %esi = src[xx+1], %ecx = 16-bit xalpha; on exit %esi holds
 * (src[xx]<<16 + (src[xx+1]-src[xx])*xalpha) >> 9 and %REG_D is
 * reloaded with the destination pointer from operand %1. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \
02253 
/* Fast bilinear horizontal luma scaling (see the plain-C fallback at the
 * bottom: dst[i] = (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha with
 * xx = xpos>>16, xalpha = (xpos&0xFFFF)>>9, xpos += xInc).
 * On x86 with MMX2 it may jump into runtime-generated filter code
 * (c->lumMmx2FilterCode) via CALL_MMX2_FILTER_CODE; otherwise it uses a
 * scalar-asm loop built on FAST_BILINEAR_X86.  Also defines the
 * CALL_MMX2_FILTER_CODE macro reused by hcscale_fast below. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    // ebx is the PIC register and cannot be clobbered; save it manually
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        // pad the tail where the filter would read past src[srcW-1]
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable reference implementation */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
02375 
02376       // *** horizontal scale Y line to temp buffer
02377 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
02378                                    const int16_t *hLumFilter,
02379                                    const int16_t *hLumFilterPos, int hLumFilterSize,
02380                                    uint8_t *formatConvBuffer,
02381                                    uint32_t *pal, int isAlpha)
02382 {
02383     void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
02384     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
02385 
02386     src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
02387 
02388     if (toYV12) {
02389         toYV12(formatConvBuffer, src, srcW, pal);
02390         src= formatConvBuffer;
02391     }
02392 
02393     if (!c->hyscale_fast) {
02394         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
02395     } else { // fast bilinear upscale / crap downscale
02396         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
02397     }
02398 
02399     if (convertRange)
02400         convertRange(dst, dstWidth);
02401 }
02402 
/* Fast bilinear horizontal chroma scaling for both planes at once:
 * U samples from src1 go to dst[0..dstWidth), V samples from src2 go to
 * dst[VOFW..).  Same interpolation as hyscale_fast (see the C fallback);
 * reuses the CALL_MMX2_FILTER_CODE macro defined in hyscale_fast. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    // ebx is the PIC register and cannot be clobbered; save it manually
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane: restart with src2, dst advanced by VOF */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        // pad the tail where the filter would read past src[srcW-1]
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable reference implementation */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
02523 
02524 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
02525                                    int srcW, int xInc, const int16_t *hChrFilter,
02526                                    const int16_t *hChrFilterPos, int hChrFilterSize,
02527                                    uint8_t *formatConvBuffer,
02528                                    uint32_t *pal)
02529 {
02530 
02531     src1 += c->chrSrcOffset;
02532     src2 += c->chrSrcOffset;
02533 
02534     if (c->chrToYV12) {
02535         c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02536         src1= formatConvBuffer;
02537         src2= formatConvBuffer+VOFW;
02538     }
02539 
02540     if (!c->hcscale_fast) {
02541         c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02542         c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02543     } else { // fast bilinear upscale / crap downscale
02544         c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
02545     }
02546 
02547     if (c->chrConvertRange)
02548         c->chrConvertRange(dst, dstWidth);
02549 }
02550 
#define DEBUG_SWSCALE_BUFFERS 0
/* Ring-buffer trace logging; compiled out unless DEBUG_SWSCALE_BUFFERS is set.
 * Wrapped in do { ... } while (0) so the expansion is exactly one statement:
 * the previous bare "if (...) av_log(...)" form could capture a following
 * "else" at the call site (dangling-else hazard, CERT PRE10-C). */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
02553 
/**
 * Scale one slice of the input picture and emit the destination lines for
 * which enough input is available.
 *
 * The scaler is slice based: srcSliceH source lines starting at srcSliceY
 * are horizontally scaled into the luma/chroma/alpha ring buffers
 * (lumPixBuf/chrPixBuf/alpPixBuf); each destination line is then vertically
 * filtered from the buffered lines and converted to the output format.
 * If the vertical filter needs lines beyond this slice, buffering state is
 * saved in the context and the function returns early.
 *
 * @return number of destination lines written by this call (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // ceiling division: chroma lines covered by this slice
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)) {
        // packed input: every plane slot aliases the single interleaved plane
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop; // vChrDrop skips chroma lines by widening the stride
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        // first slice of a frame: reset all ring-buffer state
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            // not enough input yet: only buffer what this slice provides
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        // per-line ordered-dither tables for the RGB/BGR output asm
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            // normal path: the MMX output routines may read slightly past the
            // filter arrays, which is safe except for the last output lines
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                // build the packed (pointer,pointer,coeff) records the
                // accurate-rounding asm expects, two taps per record
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                // default layout: split each source-line pointer into low/high
                // 32-bit halves plus a duplicated 16-bit coefficient
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    // NOTE(review): indexes vChrFilter with dstY rather than
                    // chrDstY; these coincide when chrDstVSubSample is 0,
                    // which is presumably the case for packed output — confirm
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            // last-lines path: identical dispatch but using the plain C
            // routines, which never over-read the filter arrays
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    // fill the alpha plane with opaque if the output has alpha but no
    // alpha was scaled
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
02919 
/**
 * Initialize the function pointers of the SwsContext for this template
 * instantiation (C/MMX/MMX2/3DNow!, selected through RENAME()).
 *
 * Selects the vertical output routines, the horizontal scaler (including the
 * fast-bilinear variants when allowed), the per-source-format unpack helpers
 * (lumToYV12/chrToYV12/alpToYV12), per-plane source byte offsets, and the
 * JPEG<->MPEG range-conversion functions.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    // vertical scaling / output conversion entry points
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    // chroma unpack helper: converts the source chroma to planar form;
    // NULL means the input chroma is already usable as-is
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    if (c->chrSrcHSubSample) {
        // horizontally subsampled chroma: use the *_half variants which
        // average two source pixels per output chroma sample
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    // luma and alpha unpack helpers
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }

    // per-plane byte offsets into the packed source pixels
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    // range conversion only applies between YUV ranges, never for RGB output
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    // gray and monochrome formats carry no chroma, so horizontal chroma
    // scaling can be skipped entirely
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}

Generated on Fri Sep 16 2011 17:17:52 for FFmpeg by  doxygen 1.7.1