Libav 0.7.1
libavcodec/x86/idct_sse2_xvid.c
Go to the documentation of this file.
00001 /*
00002  * XVID MPEG-4 VIDEO CODEC
00003  * - SSE2 inverse discrete cosine transform -
00004  *
00005  * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
00006  *
00007  * Conversion to gcc syntax with modifications
00008  * by Alexander Strange <astrange@ithinksw.com>
00009  *
00010  * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
00011  *
00012  * This file is part of Libav.
00013  *
00014  * Vertical pass is an implementation of the scheme:
00015  *  Loeffler C., Ligtenberg A., and Moschytz G.S.:
00016  *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
00017  *  Proc. ICASSP 1989, 988-991.
00018  *
00019  * Horizontal pass is a double 4x4 vector/matrix multiplication,
00020  * (see also Intel's Application Note 922:
00021  *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
00022  *  Copyright (C) 1999 Intel Corporation)
00023  *
00024  * More details at http://skal.planet-d.net/coding/dct.html
00025  *
00026  * Libav is free software; you can redistribute it and/or
00027  * modify it under the terms of the GNU Lesser General Public
00028  * License as published by the Free Software Foundation; either
00029  * version 2.1 of the License, or (at your option) any later version.
00030  *
00031  * Libav is distributed in the hope that it will be useful,
00032  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00033  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00034  * Lesser General Public License for more details.
00035  *
00036  * You should have received a copy of the GNU Lesser General Public License
00037  * along with Libav; if not, write to the Free Software Foundation,
00038  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00039  */
00040 
00041 #include "libavcodec/dsputil.h"
00042 #include "libavutil/x86_cpu.h"
00043 #include "idct_xvid.h"
00044 #include "dsputil_mmx.h"
00045 
00051 #define X8(x)     x,x,x,x,x,x,x,x
00052 
00053 #define ROW_SHIFT 11
00054 #define COL_SHIFT 6
00055 
00056 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
00057 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
00058 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
00059 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
00060 DECLARE_ASM_CONST(8,  uint8_t, m127)[] = {X8(127)};
00061 
00062 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
00063  0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00064  0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00065  0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00066  0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00067 };
00068 
00069 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
00070  0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00071  0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00072  0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00073  0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00074 };
00075 
00076 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
00077  0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00078  0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00079  0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00080  0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00081 };
00082 
00083 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
00084  0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00085  0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00086  0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00087  0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00088 };
00089 
00090 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
00091  65536, 65536, 65536, 65536,
00092   3597,  3597,  3597,  3597,
00093   2260,  2260,  2260,  2260,
00094   1203,  1203,  1203,  1203,
00095    120,   120,   120,   120,
00096    512,   512,   512,   512
00097 };
00098 
00099 // Temporary storage before the column pass
00100 #define ROW1 "%%xmm6"
00101 #define ROW3 "%%xmm4"
00102 #define ROW5 "%%xmm5"
00103 #define ROW7 "%%xmm7"
00104 
00105 #define CLEAR_ODD(r) "pxor  "r","r" \n\t"
00106 #define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
00107 
00108 #if ARCH_X86_64
00109 
00110 # define ROW0 "%%xmm8"
00111 # define REG0 ROW0
00112 # define ROW2 "%%xmm9"
00113 # define REG2 ROW2
00114 # define ROW4 "%%xmm10"
00115 # define REG4 ROW4
00116 # define ROW6 "%%xmm11"
00117 # define REG6 ROW6
00118 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00119 # define PUT_EVEN(dst) PUT_ODD(dst)
00120 # define XMMS "%%xmm12"
00121 # define MOV_32_ONLY "#"
00122 # define SREG2 REG2
00123 # define TAN3 "%%xmm13"
00124 # define TAN1 "%%xmm14"
00125 
00126 #else
00127 
00128 # define ROW0 "(%0)"
00129 # define REG0 "%%xmm4"
00130 # define ROW2 "2*16(%0)"
00131 # define REG2 "%%xmm4"
00132 # define ROW4 "4*16(%0)"
00133 # define REG4 "%%xmm6"
00134 # define ROW6 "6*16(%0)"
00135 # define REG6 "%%xmm6"
00136 # define CLEAR_EVEN(r)
00137 # define PUT_EVEN(dst) \
00138     "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
00139     "movdqa          %%xmm2, "dst"    \n\t"
00140 # define XMMS "%%xmm2"
00141 # define MOV_32_ONLY "movdqa "
00142 # define SREG2 "%%xmm7"
00143 # define TAN3 "%%xmm0"
00144 # define TAN1 "%%xmm2"
00145 
00146 #endif
00147 
00148 #define ROUND(x) "paddd   "MANGLE(x)
00149 
00150 #define JZ(reg, to)                         \
00151     "testl     "reg","reg"            \n\t" \
00152     "jz        "to"                   \n\t"
00153 
00154 #define JNZ(reg, to)                        \
00155     "testl     "reg","reg"            \n\t" \
00156     "jnz       "to"                   \n\t"
00157 
00158 #define TEST_ONE_ROW(src, reg, clear)       \
00159     clear                                   \
00160     "movq     "src", %%mm1            \n\t" \
00161     "por    8+"src", %%mm1            \n\t" \
00162     "paddusb  %%mm0, %%mm1            \n\t" \
00163     "pmovmskb %%mm1, "reg"            \n\t"
00164 
00165 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00166     clear1                                  \
00167     clear2                                  \
00168     "movq     "row1", %%mm1           \n\t" \
00169     "por    8+"row1", %%mm1           \n\t" \
00170     "movq     "row2", %%mm2           \n\t" \
00171     "por    8+"row2", %%mm2           \n\t" \
00172     "paddusb   %%mm0, %%mm1           \n\t" \
00173     "paddusb   %%mm0, %%mm2           \n\t" \
00174     "pmovmskb  %%mm1, "reg1"          \n\t" \
00175     "pmovmskb  %%mm2, "reg2"          \n\t"
00176 
00178 #define iMTX_MULT(src, table, rounder, put) \
00179     "movdqa        "src", %%xmm3      \n\t" \
00180     "movdqa       %%xmm3, %%xmm0      \n\t" \
00181     "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
00182     "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
00183     "pmaddwd     "table", %%xmm0      \n\t" \
00184     "pmaddwd  16+"table", %%xmm1      \n\t" \
00185     "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
00186     "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
00187     "pmaddwd  32+"table", %%xmm2      \n\t" \
00188     "pmaddwd  48+"table", %%xmm3      \n\t" \
00189     "paddd        %%xmm1, %%xmm0      \n\t" \
00190     "paddd        %%xmm3, %%xmm2      \n\t" \
00191     rounder",     %%xmm0              \n\t" \
00192     "movdqa       %%xmm2, %%xmm3      \n\t" \
00193     "paddd        %%xmm0, %%xmm2      \n\t" \
00194     "psubd        %%xmm3, %%xmm0      \n\t" \
00195     "psrad           $11, %%xmm2      \n\t" \
00196     "psrad           $11, %%xmm0      \n\t" \
00197     "packssdw     %%xmm0, %%xmm2      \n\t" \
00198     put                                     \
00199     "1:                               \n\t"
00200 
00201 #define iLLM_HEAD                           \
00202     "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
00203     "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
00204 
00205 
00206 #define iLLM_PASS(dct)                      \
00207     "movdqa   "TAN3", %%xmm1          \n\t" \
00208     "movdqa   "TAN1", %%xmm3          \n\t" \
00209     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00210     "pmulhw   %%xmm5, %%xmm1          \n\t" \
00211     "paddsw   %%xmm4, "TAN3"          \n\t" \
00212     "paddsw   %%xmm5, %%xmm1          \n\t" \
00213     "psubsw   %%xmm5, "TAN3"          \n\t" \
00214     "paddsw   %%xmm4, %%xmm1          \n\t" \
00215     "pmulhw   %%xmm7, %%xmm3          \n\t" \
00216     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00217     "paddsw   %%xmm6, %%xmm3          \n\t" \
00218     "psubsw   %%xmm7, "TAN1"          \n\t" \
00219     "movdqa   %%xmm3, %%xmm7          \n\t" \
00220     "movdqa   "TAN1", %%xmm6          \n\t" \
00221     "psubsw   %%xmm1, %%xmm3          \n\t" \
00222     "psubsw   "TAN3", "TAN1"          \n\t" \
00223     "paddsw   %%xmm7, %%xmm1          \n\t" \
00224     "paddsw   %%xmm6, "TAN3"          \n\t" \
00225     "movdqa   %%xmm3, %%xmm6          \n\t" \
00226     "psubsw   "TAN3", %%xmm3          \n\t" \
00227     "paddsw   %%xmm6, "TAN3"          \n\t" \
00228     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00229     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00230     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00231     "paddsw   "TAN3", "TAN3"          \n\t" \
00232     "paddsw   %%xmm3, %%xmm3          \n\t" \
00233     "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
00234     MOV_32_ONLY ROW2", "REG2"         \n\t" \
00235     MOV_32_ONLY ROW6", "REG6"         \n\t" \
00236     "movdqa   %%xmm7, %%xmm5          \n\t" \
00237     "pmulhw   "REG6", %%xmm7          \n\t" \
00238     "pmulhw   "REG2", %%xmm5          \n\t" \
00239     "paddsw   "REG2", %%xmm7          \n\t" \
00240     "psubsw   "REG6", %%xmm5          \n\t" \
00241     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00242     MOV_32_ONLY ROW4", "REG4"         \n\t" \
00243     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00244     "movdqa   "REG0", "XMMS"          \n\t" \
00245     "psubsw   "REG4", "REG0"          \n\t" \
00246     "paddsw   "XMMS", "REG4"          \n\t" \
00247     "movdqa   "REG4", "XMMS"          \n\t" \
00248     "psubsw   %%xmm7, "REG4"          \n\t" \
00249     "paddsw   "XMMS", %%xmm7          \n\t" \
00250     "movdqa   "REG0", "XMMS"          \n\t" \
00251     "psubsw   %%xmm5, "REG0"          \n\t" \
00252     "paddsw   "XMMS", %%xmm5          \n\t" \
00253     "movdqa   %%xmm5, "XMMS"          \n\t" \
00254     "psubsw   "TAN3", %%xmm5          \n\t" \
00255     "paddsw   "XMMS", "TAN3"          \n\t" \
00256     "movdqa   "REG0", "XMMS"          \n\t" \
00257     "psubsw   %%xmm3, "REG0"          \n\t" \
00258     "paddsw   "XMMS", %%xmm3          \n\t" \
00259     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00260     "psraw        $6, %%xmm5          \n\t" \
00261     "psraw        $6, "REG0"          \n\t" \
00262     "psraw        $6, "TAN3"          \n\t" \
00263     "psraw        $6, %%xmm3          \n\t" \
00264     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00265     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00266     "movdqa   "REG0", 5*16("dct")     \n\t" \
00267     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00268     "movdqa   %%xmm7, %%xmm0          \n\t" \
00269     "movdqa   "REG4", %%xmm4          \n\t" \
00270     "psubsw   %%xmm1, %%xmm7          \n\t" \
00271     "psubsw   "TAN1", "REG4"          \n\t" \
00272     "paddsw   %%xmm0, %%xmm1          \n\t" \
00273     "paddsw   %%xmm4, "TAN1"          \n\t" \
00274     "psraw        $6, %%xmm1          \n\t" \
00275     "psraw        $6, %%xmm7          \n\t" \
00276     "psraw        $6, "TAN1"          \n\t" \
00277     "psraw        $6, "REG4"          \n\t" \
00278     "movdqa   %%xmm1, ("dct")         \n\t" \
00279     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00280     "movdqa   "REG4", 4*16("dct")     \n\t" \
00281     "movdqa   %%xmm7, 7*16("dct")     \n\t"
00282 
00284 #define iLLM_PASS_SPARSE(dct)               \
00285     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00286     "paddsw   %%xmm4, "TAN3"          \n\t" \
00287     "movdqa   %%xmm6, %%xmm3          \n\t" \
00288     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00289     "movdqa   %%xmm4, %%xmm1          \n\t" \
00290     "psubsw   %%xmm1, %%xmm3          \n\t" \
00291     "paddsw   %%xmm6, %%xmm1          \n\t" \
00292     "movdqa   "TAN1", %%xmm6          \n\t" \
00293     "psubsw   "TAN3", "TAN1"          \n\t" \
00294     "paddsw   %%xmm6, "TAN3"          \n\t" \
00295     "movdqa   %%xmm3, %%xmm6          \n\t" \
00296     "psubsw   "TAN3", %%xmm3          \n\t" \
00297     "paddsw   %%xmm6, "TAN3"          \n\t" \
00298     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00299     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00300     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00301     "paddsw   "TAN3", "TAN3"          \n\t" \
00302     "paddsw   %%xmm3, %%xmm3          \n\t" \
00303     "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
00304     MOV_32_ONLY ROW2", "SREG2"        \n\t" \
00305     "pmulhw   "SREG2", %%xmm5         \n\t" \
00306     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00307     "movdqa   "REG0", %%xmm6          \n\t" \
00308     "psubsw   "SREG2", %%xmm6         \n\t" \
00309     "paddsw   "REG0", "SREG2"         \n\t" \
00310     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00311     "movdqa   "REG0", "XMMS"          \n\t" \
00312     "psubsw   %%xmm5, "REG0"          \n\t" \
00313     "paddsw   "XMMS", %%xmm5          \n\t" \
00314     "movdqa   %%xmm5, "XMMS"          \n\t" \
00315     "psubsw   "TAN3", %%xmm5          \n\t" \
00316     "paddsw   "XMMS", "TAN3"          \n\t" \
00317     "movdqa   "REG0", "XMMS"          \n\t" \
00318     "psubsw   %%xmm3, "REG0"          \n\t" \
00319     "paddsw   "XMMS", %%xmm3          \n\t" \
00320     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00321     "psraw        $6, %%xmm5          \n\t" \
00322     "psraw        $6, "REG0"          \n\t" \
00323     "psraw        $6, "TAN3"          \n\t" \
00324     "psraw        $6, %%xmm3          \n\t" \
00325     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00326     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00327     "movdqa   "REG0", 5*16("dct")     \n\t" \
00328     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00329     "movdqa   "SREG2", %%xmm0         \n\t" \
00330     "movdqa   %%xmm6, %%xmm4          \n\t" \
00331     "psubsw   %%xmm1, "SREG2"         \n\t" \
00332     "psubsw   "TAN1", %%xmm6          \n\t" \
00333     "paddsw   %%xmm0, %%xmm1          \n\t" \
00334     "paddsw   %%xmm4, "TAN1"          \n\t" \
00335     "psraw        $6, %%xmm1          \n\t" \
00336     "psraw        $6, "SREG2"         \n\t" \
00337     "psraw        $6, "TAN1"          \n\t" \
00338     "psraw        $6, %%xmm6          \n\t" \
00339     "movdqa   %%xmm1, ("dct")         \n\t" \
00340     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00341     "movdqa   %%xmm6, 4*16("dct")     \n\t" \
00342     "movdqa   "SREG2", 7*16("dct")    \n\t"
00343 
00344 inline void ff_idct_xvid_sse2(short *block)
00345 {
00346     __asm__ volatile(
00347     "movq     "MANGLE(m127)", %%mm0                              \n\t"
00348     iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),      PUT_EVEN(ROW0))
00349     iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00350     iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00351 
00352     TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
00353     JZ("%%eax", "1f")
00354     iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00355 
00356     TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00357     TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
00358     iLLM_HEAD
00359     ".p2align 4 \n\t"
00360     JNZ("%%ecx", "2f")
00361     JNZ("%%eax", "3f")
00362     JNZ("%%edx", "4f")
00363     JNZ("%%esi", "5f")
00364     iLLM_PASS_SPARSE("%0")
00365     "jmp 6f                                                      \n\t"
00366     "2:                                                          \n\t"
00367     iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
00368     "3:                                                          \n\t"
00369     iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00370     JZ("%%edx", "1f")
00371     "4:                                                          \n\t"
00372     iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00373     JZ("%%esi", "1f")
00374     "5:                                                          \n\t"
00375     iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
00376 #if !ARCH_X86_64
00377     iLLM_HEAD
00378 #endif
00379     iLLM_PASS("%0")
00380     "6:                                                          \n\t"
00381     : "+r"(block)
00382     :
00383     : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
00384                    "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
00385 #if ARCH_X86_64
00386       XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
00387                    "%xmm12", "%xmm13", "%xmm14",)
00388 #endif
00389       "%eax", "%ecx", "%edx", "%esi", "memory"
00390     );
00391 }
00392 
00393 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00394 {
00395     ff_idct_xvid_sse2(block);
00396     ff_put_pixels_clamped_mmx(block, dest, line_size);
00397 }
00398 
00399 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00400 {
00401     ff_idct_xvid_sse2(block);
00402     ff_add_pixels_clamped_mmx(block, dest, line_size);
00403 }