• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/idct_sse2_xvid.c

Go to the documentation of this file.
00001 /*
00002  * XVID MPEG-4 VIDEO CODEC
00003  * - SSE2 inverse discrete cosine transform -
00004  *
00005  * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
00006  *
00007  * Conversion to gcc syntax with modifications
00008  * by Alexander Strange <astrange@ithinksw.com>
00009  *
00010  * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
00011  *
00012  * This file is part of FFmpeg.
00013  *
00014  * Vertical pass is an implementation of the scheme:
00015  *  Loeffler C., Ligtenberg A., and Moschytz G.S.:
00016  *  Practical Fast 1-D DCT Algorithms with 11 Multiplications,
00017  *  Proc. ICASSP 1989, 988-991.
00018  *
00019  * Horizontal pass is a double 4x4 vector/matrix multiplication,
00020  * (see also Intel's Application Note 922:
00021  *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
00022  *  Copyright (C) 1999 Intel Corporation)
00023  *
00024  * More details at http://skal.planet-d.net/coding/dct.html
00025  *
00026  * FFmpeg is free software; you can redistribute it and/or
00027  * modify it under the terms of the GNU Lesser General Public
00028  * License as published by the Free Software Foundation; either
00029  * version 2.1 of the License, or (at your option) any later version.
00030  *
00031  * FFmpeg is distributed in the hope that it will be useful,
00032  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00033  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00034  * Lesser General Public License for more details.
00035  *
00036  * You should have received a copy of the GNU Lesser General Public License
00037  * along with FFmpeg; if not, write to the Free Software Foundation,
00038  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00039  */
00040 
00041 #include "libavcodec/dsputil.h"
00042 #include "idct_xvid.h"
00043 #include "dsputil_mmx.h"
00044 
/* X8(x): expands to eight comma-separated copies of x. It is used below to
 * fill the 8-element initializer lists of the broadcast constants. */
00050 #define X8(x)     x,x,x,x,x,x,x,x
00051 
/* NOTE(review): ROW_SHIFT and COL_SHIFT are not referenced anywhere in the
 * visible code. The asm hard-codes the same values as literals instead:
 * "psrad $11" in iMTX_MULT and "psraw $6" in iLLM_PASS / iLLM_PASS_SPARSE.
 * If either value ever changes, the literals must be updated by hand. */
00052 #define ROW_SHIFT 11
00053 #define COL_SHIFT 6
00054 
/* Broadcast constants referenced from the asm through MANGLE().
 * DECLARE_ASM_CONST is defined outside this file; its first argument is
 * presumably the required alignment in bytes -- TODO confirm.
 *
 * tan1, tan2, tan3 and sqrt2 are the quantities named in the trailing
 * comments scaled by 65536; they are consumed by pmulhw, which keeps the high
 * 16 bits of each signed 16x16 product.
 * NOTE(review): 43790 does not fit in a signed 16-bit value. The tan3 comment
 * ("tan(3pi/16)-1") is consistent with it being stored wrapped to
 * 43790 - 65536 = -21746; the column passes follow each pmulhw by tan3 with a
 * paddsw of the same source register, which adds the missing 1*x back.
 *
 * m127 is loaded into mm0 at the start of ff_idct_xvid_sse2 and used by the
 * TEST_ONE_ROW / TEST_TWO_ROWS macros for their "is this row all zero" check. */
00055 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
00056 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
00057 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
00058 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
00059 DECLARE_ASM_CONST(8,  uint8_t, m127)[] = {X8(127)};
00060 
/* Coefficient tables for the row pass (iMTX_MULT).
 * Each table holds 32 16-bit entries = 64 bytes. iMTX_MULT reads a table as
 * four 16-byte slices (offsets 0, 16, 32 and 48), each as a pmaddwd operand.
 * ff_idct_xvid_sse2 pairs the tables with rows as follows:
 *   iTab1 -> rows 0 and 4
 *   iTab2 -> rows 1 and 7
 *   iTab3 -> rows 2 and 6
 *   iTab4 -> rows 3 and 5
 * Entries written as 0x8000..0xffff exceed the positive int16_t range; they
 * are presumably meant as negative values after conversion to int16_t --
 * TODO confirm against the compiler's conversion behaviour. */
00061 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
00062  0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00063  0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00064  0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00065  0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00066 };
00067 
00068 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
00069  0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00070  0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00071  0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00072  0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00073 };
00074 
00075 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
00076  0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00077  0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00078  0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00079  0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00080 };
00081 
00082 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
00083  0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00084  0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00085  0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00086  0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00087 };
00088 
/* Per-row rounding constants for the row pass: six groups of four identical
 * 32-bit values (16 bytes per group), added with paddd before the ">> 11".
 * ff_idct_xvid_sse2 selects a group by offset:
 *   +0     row 0
 *   +1*16  row 1
 *   +2*16  row 2
 *   +3*16  row 3
 *   +4*16  row 5
 *   +5*16  rows 6 and 7 (both use the same group)
 * Row 4 is processed without a rounder: the caller passes "#" in place of a
 * ROUND(...) operand. */
00089 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
00090  65536, 65536, 65536, 65536,
00091   3597,  3597,  3597,  3597,
00092   2260,  2260,  2260,  2260,
00093   1203,  1203,  1203,  1203,
00094    120,   120,   120,   120,
00095    512,   512,   512,   512
00096 };
00097 
/* Where each row-pass result is parked until the column pass runs.
 * Odd rows always live in fixed XMM registers (xmm6, xmm4, xmm5, xmm7); the
 * column-pass macros refer to those registers by their literal names. */
00098 // Temporary storage before the column pass
00099 #define ROW1 "%%xmm6"
00100 #define ROW3 "%%xmm4"
00101 #define ROW5 "%%xmm5"
00102 #define ROW7 "%%xmm7"
00103 
/* CLEAR_ODD(r): zero register r (pxor r,r).
 * PUT_ODD(dst): copy the row result from xmm2 to dst with pshufhw $0x1B, i.e.
 * the low four words unchanged and the high four words in reversed order. */
00104 #define CLEAR_ODD(r) "pxor  "r","r" \n\t"
00105 #define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
00106 
/* Even rows depend on the architecture.
 *
 * x86_64: even rows are kept in xmm8..xmm11, so ROWn and REGn are the same
 * register, CLEAR_EVEN/PUT_EVEN are the odd-row versions, XMMS (scratch) is
 * xmm12 and TAN3/TAN1 get their own registers (xmm13/xmm14). MOV_32_ONLY
 * expands to "#", which presumably turns the rest of that asm line into an
 * assembler comment so the 32-bit-only loads/stores vanish -- TODO confirm
 * for the assembler in use.
 *
 * 32-bit: only xmm0..xmm7 are used, so even rows are written back into the
 * block in memory (ROWn is a memory operand relative to %0) and reloaded
 * into REGn by "MOV_32_ONLY ROWn, REGn" inside the column pass. CLEAR_EVEN
 * is empty, and PUT_EVEN shuffles xmm2 in place and then stores it.
 * XMMS and TAN1 are both xmm2, and TAN3 is xmm0; these alias registers that
 * iMTX_MULT overwrites. */
00107 #if ARCH_X86_64
00108 
00109 # define ROW0 "%%xmm8"
00110 # define REG0 ROW0
00111 # define ROW2 "%%xmm9"
00112 # define REG2 ROW2
00113 # define ROW4 "%%xmm10"
00114 # define REG4 ROW4
00115 # define ROW6 "%%xmm11"
00116 # define REG6 ROW6
00117 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00118 # define PUT_EVEN(dst) PUT_ODD(dst)
00119 # define XMMS "%%xmm12"
00120 # define MOV_32_ONLY "#"
00121 # define SREG2 REG2
00122 # define TAN3 "%%xmm13"
00123 # define TAN1 "%%xmm14"
00124 
00125 #else
00126 
00127 # define ROW0 "(%0)"
00128 # define REG0 "%%xmm4"
00129 # define ROW2 "2*16(%0)"
00130 # define REG2 "%%xmm4"
00131 # define ROW4 "4*16(%0)"
00132 # define REG4 "%%xmm6"
00133 # define ROW6 "6*16(%0)"
00134 # define REG6 "%%xmm6"
00135 # define CLEAR_EVEN(r)
00136 # define PUT_EVEN(dst) \
00137     "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
00138     "movdqa          %%xmm2, "dst"    \n\t"
00139 # define XMMS "%%xmm2"
00140 # define MOV_32_ONLY "movdqa "
00141 # define SREG2 "%%xmm7"
00142 # define TAN3 "%%xmm0"
00143 # define TAN1 "%%xmm2"
00144 
00145 #endif
00146 
/* ROUND(x): the opening part of a "paddd <symbol x>" instruction. iMTX_MULT
 * appends ", %%xmm0" to complete it, so x is added to xmm0. */
00147 #define ROUND(x) "paddd   "MANGLE(x)
00148 
/* JZ(reg, to): jump to label `to` when the 32-bit register reg is zero. */
00149 #define JZ(reg, to)                         \
00150     "testl     "reg","reg"            \n\t" \
00151     "jz        "to"                   \n\t"
00152 
/* JNZ(reg, to): jump to label `to` when the 32-bit register reg is non-zero. */
00153 #define JNZ(reg, to)                        \
00154     "testl     "reg","reg"            \n\t" \
00155     "jnz       "to"                   \n\t"
00156 
/* TEST_ONE_ROW(src, reg, clear): emit `clear`, then set reg to a non-zero
 * byte mask iff the 16-byte row at src contains any non-zero byte.
 * The two 8-byte halves of the row are ORed together in mm1, mm0 is added
 * with unsigned saturation, and pmovmskb collects the top bit of every byte.
 * This relies on mm0 holding m127 (loaded at the start of ff_idct_xvid_sse2):
 * any byte >= 1 plus 127 has its top bit set. Clobbers mm1. */
00157 #define TEST_ONE_ROW(src, reg, clear)       \
00158     clear                                   \
00159     "movq     "src", %%mm1            \n\t" \
00160     "por    8+"src", %%mm1            \n\t" \
00161     "paddusb  %%mm0, %%mm1            \n\t" \
00162     "pmovmskb %%mm1, "reg"            \n\t"
00163 
/* TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same test as
 * TEST_ONE_ROW applied to two rows with the instructions interleaved;
 * reg1 receives the mask for row1 and reg2 the mask for row2.
 * Clobbers mm1 and mm2; expects mm0 == m127. */
00164 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00165     clear1                                  \
00166     clear2                                  \
00167     "movq     "row1", %%mm1           \n\t" \
00168     "por    8+"row1", %%mm1           \n\t" \
00169     "movq     "row2", %%mm2           \n\t" \
00170     "por    8+"row2", %%mm2           \n\t" \
00171     "paddusb   %%mm0, %%mm1           \n\t" \
00172     "paddusb   %%mm0, %%mm2           \n\t" \
00173     "pmovmskb  %%mm1, "reg1"          \n\t" \
00174     "pmovmskb  %%mm2, "reg2"          \n\t"
00175 
/* iMTX_MULT(src, table, rounder, put): row pass for one 8-coefficient row.
 *
 *   src     - 16-byte memory operand holding the row (read with movdqa)
 *   table   - symbol of a 64-byte coefficient table (iTab1..iTab4)
 *   rounder - ROUND(symbol) to add a rounding constant to xmm0, or "#" to
 *             add nothing (used for row 4)
 *   put     - asm fragment that moves the finished row out of xmm2
 *             (PUT_EVEN(...) or PUT_ODD(...))
 *
 * Steps: four rearranged copies of the row are built in xmm0..xmm3 and each
 * is multiplied (pmaddwd) against one 16-byte slice of table. The products
 * are summed into two partial sums, a = xmm0 (+ rounder) and b = xmm2. Then
 * xmm2 = a + b and xmm0 = a - b, both are shifted right arithmetically by 11
 * and packed with signed saturation into eight words in xmm2, which `put`
 * consumes.
 *
 * The expansion ends with the local label "1:". A JZ(reg, "1f") placed right
 * before an iMTX_MULT therefore jumps to the end of that expansion, skipping
 * the row. Clobbers xmm0, xmm1, xmm2, xmm3. */
00177 #define iMTX_MULT(src, table, rounder, put) \
00178     "movdqa        "src", %%xmm3      \n\t" \
00179     "movdqa       %%xmm3, %%xmm0      \n\t" \
00180     "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
00181     "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
00182     "pmaddwd     "table", %%xmm0      \n\t" \
00183     "pmaddwd  16+"table", %%xmm1      \n\t" \
00184     "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
00185     "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
00186     "pmaddwd  32+"table", %%xmm2      \n\t" \
00187     "pmaddwd  48+"table", %%xmm3      \n\t" \
00188     "paddd        %%xmm1, %%xmm0      \n\t" \
00189     "paddd        %%xmm3, %%xmm2      \n\t" \
00190     rounder",     %%xmm0              \n\t" \
00191     "movdqa       %%xmm2, %%xmm3      \n\t" \
00192     "paddd        %%xmm0, %%xmm2      \n\t" \
00193     "psubd        %%xmm3, %%xmm0      \n\t" \
00194     "psrad           $11, %%xmm2      \n\t" \
00195     "psrad           $11, %%xmm0      \n\t" \
00196     "packssdw     %%xmm0, %%xmm2      \n\t" \
00197     put                                     \
00198     "1:                               \n\t"
00199 
/* iLLM_HEAD: load the tan3 and tan1 constants into the TAN3 and TAN1
 * registers, as the column-pass macros expect on entry.
 * The trailing backslash on the last line continues the macro onto the
 * following empty line, so the definition still ends there. */
00200 #define iLLM_HEAD                           \
00201     "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
00202     "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
00203 
00204 
/* iLLM_PASS(dct): full column pass over all eight rows.
 *
 *   dct - base register/operand of the output block; the eight result rows
 *         are stored with movdqa at dct + 0*16 .. dct + 7*16.
 *
 * Expected on entry:
 *   - odd rows in their fixed registers: row 1 = xmm6, row 3 = xmm4,
 *     row 5 = xmm5, row 7 = xmm7
 *   - even rows in ROW0/ROW2/ROW4/ROW6 (registers on x86_64, memory at (%0)
 *     offsets on 32-bit, where MOV_32_ONLY reloads them into REGn)
 *   - TAN3 and TAN1 preloaded by iLLM_HEAD
 *
 * The odd part is computed first, then the even part (using tan2 and sqrt2
 * loaded here), then the two are combined by add/subtract pairs. Every
 * output is shifted right arithmetically by 6 before being stored.
 *
 * 32-bit detail: XMMS and TAN1 are the same register (xmm2), so TAN1 is
 * spilled to (%0) before XMMS is used as scratch and reloaded afterwards
 * (the two "MOV_32_ONLY ... (%0)" lines). By then row 0 has already been
 * loaded from (%0) into REG0, and (%0) is overwritten with the final row 0
 * at the end. The operand literally named %0 is hard-coded for these
 * accesses regardless of the dct argument.
 *
 * Clobbers xmm0..xmm7 plus XMMS/TAN1/TAN3 and, on x86_64, xmm8..xmm11. */
00205 #define iLLM_PASS(dct)                      \
00206     "movdqa   "TAN3", %%xmm1          \n\t" \
00207     "movdqa   "TAN1", %%xmm3          \n\t" \
00208     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00209     "pmulhw   %%xmm5, %%xmm1          \n\t" \
00210     "paddsw   %%xmm4, "TAN3"          \n\t" \
00211     "paddsw   %%xmm5, %%xmm1          \n\t" \
00212     "psubsw   %%xmm5, "TAN3"          \n\t" \
00213     "paddsw   %%xmm4, %%xmm1          \n\t" \
00214     "pmulhw   %%xmm7, %%xmm3          \n\t" \
00215     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00216     "paddsw   %%xmm6, %%xmm3          \n\t" \
00217     "psubsw   %%xmm7, "TAN1"          \n\t" \
00218     "movdqa   %%xmm3, %%xmm7          \n\t" \
00219     "movdqa   "TAN1", %%xmm6          \n\t" \
00220     "psubsw   %%xmm1, %%xmm3          \n\t" \
00221     "psubsw   "TAN3", "TAN1"          \n\t" \
00222     "paddsw   %%xmm7, %%xmm1          \n\t" \
00223     "paddsw   %%xmm6, "TAN3"          \n\t" \
00224     "movdqa   %%xmm3, %%xmm6          \n\t" \
00225     "psubsw   "TAN3", %%xmm3          \n\t" \
00226     "paddsw   %%xmm6, "TAN3"          \n\t" \
00227     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00228     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00229     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00230     "paddsw   "TAN3", "TAN3"          \n\t" \
00231     "paddsw   %%xmm3, %%xmm3          \n\t" \
00232     "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
00233     MOV_32_ONLY ROW2", "REG2"         \n\t" \
00234     MOV_32_ONLY ROW6", "REG6"         \n\t" \
00235     "movdqa   %%xmm7, %%xmm5          \n\t" \
00236     "pmulhw   "REG6", %%xmm7          \n\t" \
00237     "pmulhw   "REG2", %%xmm5          \n\t" \
00238     "paddsw   "REG2", %%xmm7          \n\t" \
00239     "psubsw   "REG6", %%xmm5          \n\t" \
00240     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00241     MOV_32_ONLY ROW4", "REG4"         \n\t" \
00242     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00243     "movdqa   "REG0", "XMMS"          \n\t" \
00244     "psubsw   "REG4", "REG0"          \n\t" \
00245     "paddsw   "XMMS", "REG4"          \n\t" \
00246     "movdqa   "REG4", "XMMS"          \n\t" \
00247     "psubsw   %%xmm7, "REG4"          \n\t" \
00248     "paddsw   "XMMS", %%xmm7          \n\t" \
00249     "movdqa   "REG0", "XMMS"          \n\t" \
00250     "psubsw   %%xmm5, "REG0"          \n\t" \
00251     "paddsw   "XMMS", %%xmm5          \n\t" \
00252     "movdqa   %%xmm5, "XMMS"          \n\t" \
00253     "psubsw   "TAN3", %%xmm5          \n\t" \
00254     "paddsw   "XMMS", "TAN3"          \n\t" \
00255     "movdqa   "REG0", "XMMS"          \n\t" \
00256     "psubsw   %%xmm3, "REG0"          \n\t" \
00257     "paddsw   "XMMS", %%xmm3          \n\t" \
00258     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00259     "psraw        $6, %%xmm5          \n\t" \
00260     "psraw        $6, "REG0"          \n\t" \
00261     "psraw        $6, "TAN3"          \n\t" \
00262     "psraw        $6, %%xmm3          \n\t" \
00263     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00264     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00265     "movdqa   "REG0", 5*16("dct")     \n\t" \
00266     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00267     "movdqa   %%xmm7, %%xmm0          \n\t" \
00268     "movdqa   "REG4", %%xmm4          \n\t" \
00269     "psubsw   %%xmm1, %%xmm7          \n\t" \
00270     "psubsw   "TAN1", "REG4"          \n\t" \
00271     "paddsw   %%xmm0, %%xmm1          \n\t" \
00272     "paddsw   %%xmm4, "TAN1"          \n\t" \
00273     "psraw        $6, %%xmm1          \n\t" \
00274     "psraw        $6, %%xmm7          \n\t" \
00275     "psraw        $6, "TAN1"          \n\t" \
00276     "psraw        $6, "REG4"          \n\t" \
00277     "movdqa   %%xmm1, ("dct")         \n\t" \
00278     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00279     "movdqa   "REG4", 4*16("dct")     \n\t" \
00280     "movdqa   %%xmm7, 7*16("dct")     \n\t"
00281 
/* iLLM_PASS_SPARSE(dct): reduced column pass that reads only rows 0..3.
 *
 *   dct - base register/operand of the output block; all eight result rows
 *         are still stored, at dct + 0*16 .. dct + 7*16.
 *
 * ff_idct_xvid_sse2 takes this path only when its zero tests report rows
 * 4, 5, 6 and 7 as all-zero. Compared with iLLM_PASS, the terms involving
 * rows 4..7 are dropped: only xmm6 (row 1), xmm4 (row 3), ROW0 and ROW2 are
 * read as inputs, and xmm5/xmm7 are overwritten before being read.
 *
 * Expected on entry: rows 1 and 3 in xmm6 and xmm4, rows 0 and 2 in
 * ROW0/ROW2, TAN3 and TAN1 preloaded by iLLM_HEAD. SREG2 is the register
 * that holds row 2 here (REG2 on x86_64, xmm7 on 32-bit).
 *
 * As in iLLM_PASS, every output is shifted right arithmetically by 6, and on
 * 32-bit TAN1 is temporarily spilled to (%0) because it shares xmm2 with the
 * XMMS scratch register. */
00283 #define iLLM_PASS_SPARSE(dct)               \
00284     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00285     "paddsw   %%xmm4, "TAN3"          \n\t" \
00286     "movdqa   %%xmm6, %%xmm3          \n\t" \
00287     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00288     "movdqa   %%xmm4, %%xmm1          \n\t" \
00289     "psubsw   %%xmm1, %%xmm3          \n\t" \
00290     "paddsw   %%xmm6, %%xmm1          \n\t" \
00291     "movdqa   "TAN1", %%xmm6          \n\t" \
00292     "psubsw   "TAN3", "TAN1"          \n\t" \
00293     "paddsw   %%xmm6, "TAN3"          \n\t" \
00294     "movdqa   %%xmm3, %%xmm6          \n\t" \
00295     "psubsw   "TAN3", %%xmm3          \n\t" \
00296     "paddsw   %%xmm6, "TAN3"          \n\t" \
00297     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00298     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00299     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00300     "paddsw   "TAN3", "TAN3"          \n\t" \
00301     "paddsw   %%xmm3, %%xmm3          \n\t" \
00302     "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
00303     MOV_32_ONLY ROW2", "SREG2"        \n\t" \
00304     "pmulhw   "SREG2", %%xmm5         \n\t" \
00305     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00306     "movdqa   "REG0", %%xmm6          \n\t" \
00307     "psubsw   "SREG2", %%xmm6         \n\t" \
00308     "paddsw   "REG0", "SREG2"         \n\t" \
00309     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00310     "movdqa   "REG0", "XMMS"          \n\t" \
00311     "psubsw   %%xmm5, "REG0"          \n\t" \
00312     "paddsw   "XMMS", %%xmm5          \n\t" \
00313     "movdqa   %%xmm5, "XMMS"          \n\t" \
00314     "psubsw   "TAN3", %%xmm5          \n\t" \
00315     "paddsw   "XMMS", "TAN3"          \n\t" \
00316     "movdqa   "REG0", "XMMS"          \n\t" \
00317     "psubsw   %%xmm3, "REG0"          \n\t" \
00318     "paddsw   "XMMS", %%xmm3          \n\t" \
00319     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00320     "psraw        $6, %%xmm5          \n\t" \
00321     "psraw        $6, "REG0"          \n\t" \
00322     "psraw        $6, "TAN3"          \n\t" \
00323     "psraw        $6, %%xmm3          \n\t" \
00324     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00325     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00326     "movdqa   "REG0", 5*16("dct")     \n\t" \
00327     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00328     "movdqa   "SREG2", %%xmm0         \n\t" \
00329     "movdqa   %%xmm6, %%xmm4          \n\t" \
00330     "psubsw   %%xmm1, "SREG2"         \n\t" \
00331     "psubsw   "TAN1", %%xmm6          \n\t" \
00332     "paddsw   %%xmm0, %%xmm1          \n\t" \
00333     "paddsw   %%xmm4, "TAN1"          \n\t" \
00334     "psraw        $6, %%xmm1          \n\t" \
00335     "psraw        $6, "SREG2"         \n\t" \
00336     "psraw        $6, "TAN1"          \n\t" \
00337     "psraw        $6, %%xmm6          \n\t" \
00338     "movdqa   %%xmm1, ("dct")         \n\t" \
00339     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00340     "movdqa   %%xmm6, 4*16("dct")     \n\t" \
00341     "movdqa   "SREG2", 7*16("dct")    \n\t"
00342 
/**
 * In-place 8x8 inverse DCT on a block of 16-bit coefficients.
 *
 * The whole transform is one inline-asm statement: a row pass (iMTX_MULT per
 * row) followed by a column pass (iLLM_PASS, or iLLM_PASS_SPARSE when rows
 * 4..7 all tested as zero). Rows are addressed as 16-byte slots at
 * block + 0*16 .. block + 7*16 and accessed with movdqa, so block must be
 * 16-byte aligned.
 *
 * Control flow through the numeric labels:
 *   - rows 0, 1 and 2 are always transformed;
 *   - row 3 is transformed unless its zero test (eax) says it is empty, in
 *     which case JZ jumps to the "1:" label that ends its iMTX_MULT and the
 *     register cleared by CLEAR_ODD(ROW3) is used as-is;
 *   - rows 4, 5, 6 and 7 are tested into ecx, eax, edx and esi; the JNZ chain
 *     enters the sequence "2:" (row 4), "3:" (row 5), "4:" (row 6),
 *     "5:" (row 7) at the first non-zero row;
 *   - once inside that sequence, row 5 is always transformed, while rows 6
 *     and 7 are skipped via JZ ..., "1f" when their test result is zero;
 *   - if none of rows 4..7 is non-zero, the sparse column pass runs and
 *     jumps to "6:" (the end).
 *
 * @param block  coefficients in, reconstructed values out (overwritten).
 *
 * NOTE(review): the clobber list names only eax, ecx, edx, esi and "memory",
 * yet the asm also writes mm0..mm2 and xmm0..xmm7 (plus xmm8..xmm14 on
 * x86_64), and no emms is issued here. Confirm that callers and the target
 * ABI tolerate this.
 */
00343 inline void ff_idct_xvid_sse2(short *block)
00344 {
00345     __asm__ volatile(
    /* mm0 = 127 in every byte; required by the TEST_* zero checks below. */
00346     "movq     "MANGLE(m127)", %%mm0                              \n\t"
00347     iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),      PUT_EVEN(ROW0))
00348     iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00349     iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00350 
    /* eax = row 3 non-zero?, ecx = row 4 non-zero? (ecx is consumed later). */
00351     TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
00352     JZ("%%eax", "1f")
00353     iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00354 
    /* eax is reused for row 5; edx = row 6, esi = row 7. */
00355     TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00356     TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
00357     iLLM_HEAD
00358     ASMALIGN(4)
00359     JNZ("%%ecx", "2f")
00360     JNZ("%%eax", "3f")
00361     JNZ("%%edx", "4f")
00362     JNZ("%%esi", "5f")
    /* Rows 4..7 are all zero: reduced column pass, then done. */
00363     iLLM_PASS_SPARSE("%0")
00364     "jmp 6f                                                      \n\t"
00365     "2:                                                          \n\t"
    /* Row 4 is the only row transformed without a rounding constant. */
00366     iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
00367     "3:                                                          \n\t"
00368     iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00369     JZ("%%edx", "1f")
00370     "4:                                                          \n\t"
00371     iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00372     JZ("%%esi", "1f")
00373     "5:                                                          \n\t"
    /* Row 7 uses the same rounder group (+5*16) as row 6. */
00374     iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
    /* On 32-bit, TAN3/TAN1 are xmm0/xmm2, which the iMTX_MULT calls above
     * have overwritten, so the constants are loaded again. */
00375 #if !ARCH_X86_64
00376     iLLM_HEAD
00377 #endif
00378     iLLM_PASS("%0")
00379     "6:                                                          \n\t"
00380     : "+r"(block)
00381     :
00382     : "%eax", "%ecx", "%edx", "%esi", "memory");
00383 }
00384 
/**
 * Run ff_idct_xvid_sse2() on block in place, then pass the result to
 * put_pixels_clamped_mmx(block, dest, line_size).
 *
 * put_pixels_clamped_mmx is defined outside this file; it presumably writes
 * the block to dest as clamped 8-bit pixels with a row stride of line_size
 * -- TODO confirm.
 *
 * @param dest       destination passed through to put_pixels_clamped_mmx
 * @param line_size  stride passed through to put_pixels_clamped_mmx
 * @param block      coefficient block; overwritten by the IDCT
 */
00385 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00386 {
00387     ff_idct_xvid_sse2(block);
00388     put_pixels_clamped_mmx(block, dest, line_size);
00389 }
00390 
/**
 * Run ff_idct_xvid_sse2() on block in place, then pass the result to
 * add_pixels_clamped_mmx(block, dest, line_size).
 *
 * add_pixels_clamped_mmx is defined outside this file; it presumably adds
 * the block to the pixels already at dest, with clamping and a row stride of
 * line_size -- TODO confirm.
 *
 * @param dest       destination passed through to add_pixels_clamped_mmx
 * @param line_size  stride passed through to add_pixels_clamped_mmx
 * @param block      coefficient block; overwritten by the IDCT
 */
00391 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00392 {
00393     ff_idct_xvid_sse2(block);
00394     add_pixels_clamped_mmx(block, dest, line_size);
00395 }

Generated on Fri Sep 16 2011 17:17:46 for FFmpeg by  doxygen 1.7.1