/* Libav 0.7.1 */
00001 /* 00002 * XVID MPEG-4 VIDEO CODEC 00003 * - SSE2 inverse discrete cosine transform - 00004 * 00005 * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> 00006 * 00007 * Conversion to gcc syntax with modifications 00008 * by Alexander Strange <astrange@ithinksw.com> 00009 * 00010 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. 00011 * 00012 * This file is part of Libav. 00013 * 00014 * Vertical pass is an implementation of the scheme: 00015 * Loeffler C., Ligtenberg A., and Moschytz C.S.: 00016 * Practical Fast 1D DCT Algorithm with Eleven Multiplications, 00017 * Proc. ICASSP 1989, 988-991. 00018 * 00019 * Horizontal pass is a double 4x4 vector/matrix multiplication, 00020 * (see also Intel's Application Note 922: 00021 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm 00022 * Copyright (C) 1999 Intel Corporation) 00023 * 00024 * More details at http://skal.planet-d.net/coding/dct.html 00025 * 00026 * Libav is free software; you can redistribute it and/or 00027 * modify it under the terms of the GNU Lesser General Public 00028 * License as published by the Free Software Foundation; either 00029 * version 2.1 of the License, or (at your option) any later version. 00030 * 00031 * Libav is distributed in the hope that it will be useful, 00032 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00033 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00034 * Lesser General Public License for more details. 
00035 * 00036 * You should have received a copy of the GNU Lesser General Public License 00037 * along with Libav; if not, write to the Free Software Foundation, 00038 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00039 */ 00040 00041 #include "libavcodec/dsputil.h" 00042 #include "libavutil/x86_cpu.h" 00043 #include "idct_xvid.h" 00044 #include "dsputil_mmx.h" 00045 00051 #define X8(x) x,x,x,x,x,x,x,x 00052 00053 #define ROW_SHIFT 11 00054 #define COL_SHIFT 6 00055 00056 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) 00057 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 00058 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 00059 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) 00060 DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; 00061 00062 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { 00063 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, 00064 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, 00065 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, 00066 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b 00067 }; 00068 00069 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { 00070 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, 00071 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, 00072 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, 00073 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df 00074 }; 00075 00076 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { 00077 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, 00078 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, 00079 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, 00080 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 00081 }; 00082 00083 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { 00084 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 
0xd746, 0x4b42, 0xd746, 00085 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, 00086 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, 00087 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e 00088 }; 00089 00090 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { 00091 65536, 65536, 65536, 65536, 00092 3597, 3597, 3597, 3597, 00093 2260, 2260, 2260, 2260, 00094 1203, 1203, 1203, 1203, 00095 120, 120, 120, 120, 00096 512, 512, 512, 512 00097 }; 00098 00099 // Temporary storage before the column pass 00100 #define ROW1 "%%xmm6" 00101 #define ROW3 "%%xmm4" 00102 #define ROW5 "%%xmm5" 00103 #define ROW7 "%%xmm7" 00104 00105 #define CLEAR_ODD(r) "pxor "r","r" \n\t" 00106 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" 00107 00108 #if ARCH_X86_64 00109 00110 # define ROW0 "%%xmm8" 00111 # define REG0 ROW0 00112 # define ROW2 "%%xmm9" 00113 # define REG2 ROW2 00114 # define ROW4 "%%xmm10" 00115 # define REG4 ROW4 00116 # define ROW6 "%%xmm11" 00117 # define REG6 ROW6 00118 # define CLEAR_EVEN(r) CLEAR_ODD(r) 00119 # define PUT_EVEN(dst) PUT_ODD(dst) 00120 # define XMMS "%%xmm12" 00121 # define MOV_32_ONLY "#" 00122 # define SREG2 REG2 00123 # define TAN3 "%%xmm13" 00124 # define TAN1 "%%xmm14" 00125 00126 #else 00127 00128 # define ROW0 "(%0)" 00129 # define REG0 "%%xmm4" 00130 # define ROW2 "2*16(%0)" 00131 # define REG2 "%%xmm4" 00132 # define ROW4 "4*16(%0)" 00133 # define REG4 "%%xmm6" 00134 # define ROW6 "6*16(%0)" 00135 # define REG6 "%%xmm6" 00136 # define CLEAR_EVEN(r) 00137 # define PUT_EVEN(dst) \ 00138 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ 00139 "movdqa %%xmm2, "dst" \n\t" 00140 # define XMMS "%%xmm2" 00141 # define MOV_32_ONLY "movdqa " 00142 # define SREG2 "%%xmm7" 00143 # define TAN3 "%%xmm0" 00144 # define TAN1 "%%xmm2" 00145 00146 #endif 00147 00148 #define ROUND(x) "paddd "MANGLE(x) 00149 00150 #define JZ(reg, to) \ 00151 "testl "reg","reg" \n\t" \ 00152 "jz "to" \n\t" 00153 00154 #define JNZ(reg, 
to) \ 00155 "testl "reg","reg" \n\t" \ 00156 "jnz "to" \n\t" 00157 00158 #define TEST_ONE_ROW(src, reg, clear) \ 00159 clear \ 00160 "movq "src", %%mm1 \n\t" \ 00161 "por 8+"src", %%mm1 \n\t" \ 00162 "paddusb %%mm0, %%mm1 \n\t" \ 00163 "pmovmskb %%mm1, "reg" \n\t" 00164 00165 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ 00166 clear1 \ 00167 clear2 \ 00168 "movq "row1", %%mm1 \n\t" \ 00169 "por 8+"row1", %%mm1 \n\t" \ 00170 "movq "row2", %%mm2 \n\t" \ 00171 "por 8+"row2", %%mm2 \n\t" \ 00172 "paddusb %%mm0, %%mm1 \n\t" \ 00173 "paddusb %%mm0, %%mm2 \n\t" \ 00174 "pmovmskb %%mm1, "reg1" \n\t" \ 00175 "pmovmskb %%mm2, "reg2" \n\t" 00176 00178 #define iMTX_MULT(src, table, rounder, put) \ 00179 "movdqa "src", %%xmm3 \n\t" \ 00180 "movdqa %%xmm3, %%xmm0 \n\t" \ 00181 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ 00182 "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ 00183 "pmaddwd "table", %%xmm0 \n\t" \ 00184 "pmaddwd 16+"table", %%xmm1 \n\t" \ 00185 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ 00186 "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ 00187 "pmaddwd 32+"table", %%xmm2 \n\t" \ 00188 "pmaddwd 48+"table", %%xmm3 \n\t" \ 00189 "paddd %%xmm1, %%xmm0 \n\t" \ 00190 "paddd %%xmm3, %%xmm2 \n\t" \ 00191 rounder", %%xmm0 \n\t" \ 00192 "movdqa %%xmm2, %%xmm3 \n\t" \ 00193 "paddd %%xmm0, %%xmm2 \n\t" \ 00194 "psubd %%xmm3, %%xmm0 \n\t" \ 00195 "psrad $11, %%xmm2 \n\t" \ 00196 "psrad $11, %%xmm0 \n\t" \ 00197 "packssdw %%xmm0, %%xmm2 \n\t" \ 00198 put \ 00199 "1: \n\t" 00200 00201 #define iLLM_HEAD \ 00202 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ 00203 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ 00204 00205 00206 #define iLLM_PASS(dct) \ 00207 "movdqa "TAN3", %%xmm1 \n\t" \ 00208 "movdqa "TAN1", %%xmm3 \n\t" \ 00209 "pmulhw %%xmm4, "TAN3" \n\t" \ 00210 "pmulhw %%xmm5, %%xmm1 \n\t" \ 00211 "paddsw %%xmm4, "TAN3" \n\t" \ 00212 "paddsw %%xmm5, %%xmm1 \n\t" \ 00213 "psubsw %%xmm5, "TAN3" \n\t" \ 00214 "paddsw %%xmm4, %%xmm1 \n\t" \ 00215 "pmulhw %%xmm7, %%xmm3 
\n\t" \ 00216 "pmulhw %%xmm6, "TAN1" \n\t" \ 00217 "paddsw %%xmm6, %%xmm3 \n\t" \ 00218 "psubsw %%xmm7, "TAN1" \n\t" \ 00219 "movdqa %%xmm3, %%xmm7 \n\t" \ 00220 "movdqa "TAN1", %%xmm6 \n\t" \ 00221 "psubsw %%xmm1, %%xmm3 \n\t" \ 00222 "psubsw "TAN3", "TAN1" \n\t" \ 00223 "paddsw %%xmm7, %%xmm1 \n\t" \ 00224 "paddsw %%xmm6, "TAN3" \n\t" \ 00225 "movdqa %%xmm3, %%xmm6 \n\t" \ 00226 "psubsw "TAN3", %%xmm3 \n\t" \ 00227 "paddsw %%xmm6, "TAN3" \n\t" \ 00228 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ 00229 "pmulhw %%xmm4, %%xmm3 \n\t" \ 00230 "pmulhw %%xmm4, "TAN3" \n\t" \ 00231 "paddsw "TAN3", "TAN3" \n\t" \ 00232 "paddsw %%xmm3, %%xmm3 \n\t" \ 00233 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ 00234 MOV_32_ONLY ROW2", "REG2" \n\t" \ 00235 MOV_32_ONLY ROW6", "REG6" \n\t" \ 00236 "movdqa %%xmm7, %%xmm5 \n\t" \ 00237 "pmulhw "REG6", %%xmm7 \n\t" \ 00238 "pmulhw "REG2", %%xmm5 \n\t" \ 00239 "paddsw "REG2", %%xmm7 \n\t" \ 00240 "psubsw "REG6", %%xmm5 \n\t" \ 00241 MOV_32_ONLY ROW0", "REG0" \n\t" \ 00242 MOV_32_ONLY ROW4", "REG4" \n\t" \ 00243 MOV_32_ONLY" "TAN1", (%0) \n\t" \ 00244 "movdqa "REG0", "XMMS" \n\t" \ 00245 "psubsw "REG4", "REG0" \n\t" \ 00246 "paddsw "XMMS", "REG4" \n\t" \ 00247 "movdqa "REG4", "XMMS" \n\t" \ 00248 "psubsw %%xmm7, "REG4" \n\t" \ 00249 "paddsw "XMMS", %%xmm7 \n\t" \ 00250 "movdqa "REG0", "XMMS" \n\t" \ 00251 "psubsw %%xmm5, "REG0" \n\t" \ 00252 "paddsw "XMMS", %%xmm5 \n\t" \ 00253 "movdqa %%xmm5, "XMMS" \n\t" \ 00254 "psubsw "TAN3", %%xmm5 \n\t" \ 00255 "paddsw "XMMS", "TAN3" \n\t" \ 00256 "movdqa "REG0", "XMMS" \n\t" \ 00257 "psubsw %%xmm3, "REG0" \n\t" \ 00258 "paddsw "XMMS", %%xmm3 \n\t" \ 00259 MOV_32_ONLY" (%0), "TAN1" \n\t" \ 00260 "psraw $6, %%xmm5 \n\t" \ 00261 "psraw $6, "REG0" \n\t" \ 00262 "psraw $6, "TAN3" \n\t" \ 00263 "psraw $6, %%xmm3 \n\t" \ 00264 "movdqa "TAN3", 1*16("dct") \n\t" \ 00265 "movdqa %%xmm3, 2*16("dct") \n\t" \ 00266 "movdqa "REG0", 5*16("dct") \n\t" \ 00267 "movdqa %%xmm5, 6*16("dct") \n\t" \ 00268 "movdqa %%xmm7, %%xmm0 
\n\t" \ 00269 "movdqa "REG4", %%xmm4 \n\t" \ 00270 "psubsw %%xmm1, %%xmm7 \n\t" \ 00271 "psubsw "TAN1", "REG4" \n\t" \ 00272 "paddsw %%xmm0, %%xmm1 \n\t" \ 00273 "paddsw %%xmm4, "TAN1" \n\t" \ 00274 "psraw $6, %%xmm1 \n\t" \ 00275 "psraw $6, %%xmm7 \n\t" \ 00276 "psraw $6, "TAN1" \n\t" \ 00277 "psraw $6, "REG4" \n\t" \ 00278 "movdqa %%xmm1, ("dct") \n\t" \ 00279 "movdqa "TAN1", 3*16("dct") \n\t" \ 00280 "movdqa "REG4", 4*16("dct") \n\t" \ 00281 "movdqa %%xmm7, 7*16("dct") \n\t" 00282 00284 #define iLLM_PASS_SPARSE(dct) \ 00285 "pmulhw %%xmm4, "TAN3" \n\t" \ 00286 "paddsw %%xmm4, "TAN3" \n\t" \ 00287 "movdqa %%xmm6, %%xmm3 \n\t" \ 00288 "pmulhw %%xmm6, "TAN1" \n\t" \ 00289 "movdqa %%xmm4, %%xmm1 \n\t" \ 00290 "psubsw %%xmm1, %%xmm3 \n\t" \ 00291 "paddsw %%xmm6, %%xmm1 \n\t" \ 00292 "movdqa "TAN1", %%xmm6 \n\t" \ 00293 "psubsw "TAN3", "TAN1" \n\t" \ 00294 "paddsw %%xmm6, "TAN3" \n\t" \ 00295 "movdqa %%xmm3, %%xmm6 \n\t" \ 00296 "psubsw "TAN3", %%xmm3 \n\t" \ 00297 "paddsw %%xmm6, "TAN3" \n\t" \ 00298 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ 00299 "pmulhw %%xmm4, %%xmm3 \n\t" \ 00300 "pmulhw %%xmm4, "TAN3" \n\t" \ 00301 "paddsw "TAN3", "TAN3" \n\t" \ 00302 "paddsw %%xmm3, %%xmm3 \n\t" \ 00303 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ 00304 MOV_32_ONLY ROW2", "SREG2" \n\t" \ 00305 "pmulhw "SREG2", %%xmm5 \n\t" \ 00306 MOV_32_ONLY ROW0", "REG0" \n\t" \ 00307 "movdqa "REG0", %%xmm6 \n\t" \ 00308 "psubsw "SREG2", %%xmm6 \n\t" \ 00309 "paddsw "REG0", "SREG2" \n\t" \ 00310 MOV_32_ONLY" "TAN1", (%0) \n\t" \ 00311 "movdqa "REG0", "XMMS" \n\t" \ 00312 "psubsw %%xmm5, "REG0" \n\t" \ 00313 "paddsw "XMMS", %%xmm5 \n\t" \ 00314 "movdqa %%xmm5, "XMMS" \n\t" \ 00315 "psubsw "TAN3", %%xmm5 \n\t" \ 00316 "paddsw "XMMS", "TAN3" \n\t" \ 00317 "movdqa "REG0", "XMMS" \n\t" \ 00318 "psubsw %%xmm3, "REG0" \n\t" \ 00319 "paddsw "XMMS", %%xmm3 \n\t" \ 00320 MOV_32_ONLY" (%0), "TAN1" \n\t" \ 00321 "psraw $6, %%xmm5 \n\t" \ 00322 "psraw $6, "REG0" \n\t" \ 00323 "psraw $6, "TAN3" \n\t" \ 00324 
"psraw $6, %%xmm3 \n\t" \ 00325 "movdqa "TAN3", 1*16("dct") \n\t" \ 00326 "movdqa %%xmm3, 2*16("dct") \n\t" \ 00327 "movdqa "REG0", 5*16("dct") \n\t" \ 00328 "movdqa %%xmm5, 6*16("dct") \n\t" \ 00329 "movdqa "SREG2", %%xmm0 \n\t" \ 00330 "movdqa %%xmm6, %%xmm4 \n\t" \ 00331 "psubsw %%xmm1, "SREG2" \n\t" \ 00332 "psubsw "TAN1", %%xmm6 \n\t" \ 00333 "paddsw %%xmm0, %%xmm1 \n\t" \ 00334 "paddsw %%xmm4, "TAN1" \n\t" \ 00335 "psraw $6, %%xmm1 \n\t" \ 00336 "psraw $6, "SREG2" \n\t" \ 00337 "psraw $6, "TAN1" \n\t" \ 00338 "psraw $6, %%xmm6 \n\t" \ 00339 "movdqa %%xmm1, ("dct") \n\t" \ 00340 "movdqa "TAN1", 3*16("dct") \n\t" \ 00341 "movdqa %%xmm6, 4*16("dct") \n\t" \ 00342 "movdqa "SREG2", 7*16("dct") \n\t" 00343 00344 inline void ff_idct_xvid_sse2(short *block) 00345 { 00346 __asm__ volatile( 00347 "movq "MANGLE(m127)", %%mm0 \n\t" 00348 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) 00349 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) 00350 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) 00351 00352 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) 00353 JZ("%%eax", "1f") 00354 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) 00355 00356 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) 00357 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) 00358 iLLM_HEAD 00359 ".p2align 4 \n\t" 00360 JNZ("%%ecx", "2f") 00361 JNZ("%%eax", "3f") 00362 JNZ("%%edx", "4f") 00363 JNZ("%%esi", "5f") 00364 iLLM_PASS_SPARSE("%0") 00365 "jmp 6f \n\t" 00366 "2: \n\t" 00367 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) 00368 "3: \n\t" 00369 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) 00370 JZ("%%edx", "1f") 00371 "4: \n\t" 00372 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) 00373 
JZ("%%esi", "1f") 00374 "5: \n\t" 00375 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) 00376 #if !ARCH_X86_64 00377 iLLM_HEAD 00378 #endif 00379 iLLM_PASS("%0") 00380 "6: \n\t" 00381 : "+r"(block) 00382 : 00383 : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , 00384 "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,) 00385 #if ARCH_X86_64 00386 XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11", 00387 "%xmm12", "%xmm13", "%xmm14",) 00388 #endif 00389 "%eax", "%ecx", "%edx", "%esi", "memory" 00390 ); 00391 } 00392 00393 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) 00394 { 00395 ff_idct_xvid_sse2(block); 00396 ff_put_pixels_clamped_mmx(block, dest, line_size); 00397 } 00398 00399 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) 00400 { 00401 ff_idct_xvid_sse2(block); 00402 ff_add_pixels_clamped_mmx(block, dest, line_size); 00403 }