/* Libav */
00001 /* 00002 * XVID MPEG-4 VIDEO CODEC 00003 * - SSE2 inverse discrete cosine transform - 00004 * 00005 * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> 00006 * 00007 * Conversion to gcc syntax with modifications 00008 * by Alexander Strange <astrange@ithinksw.com> 00009 * 00010 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. 00011 * 00012 * This file is part of FFmpeg. 00013 * 00014 * Vertical pass is an implementation of the scheme: 00015 * Loeffler C., Ligtenberg A., and Moschytz C.S.: 00016 * Practical Fast 1D DCT Algorithm with Eleven Multiplications, 00017 * Proc. ICASSP 1989, 988-991. 00018 * 00019 * Horizontal pass is a double 4x4 vector/matrix multiplication, 00020 * (see also Intel's Application Note 922: 00021 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm 00022 * Copyright (C) 1999 Intel Corporation) 00023 * 00024 * More details at http://skal.planet-d.net/coding/dct.html 00025 * 00026 * FFmpeg is free software; you can redistribute it and/or 00027 * modify it under the terms of the GNU Lesser General Public 00028 * License as published by the Free Software Foundation; either 00029 * version 2.1 of the License, or (at your option) any later version. 00030 * 00031 * FFmpeg is distributed in the hope that it will be useful, 00032 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00033 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00034 * Lesser General Public License for more details. 
00035 * 00036 * You should have received a copy of the GNU Lesser General Public License 00037 * along with FFmpeg; if not, write to the Free Software Foundation, 00038 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00039 */ 00040 00041 #include "libavcodec/dsputil.h" 00042 #include "idct_xvid.h" 00043 #include "dsputil_mmx.h" 00044 00050 #define X8(x) x,x,x,x,x,x,x,x 00051 00052 #define ROW_SHIFT 11 00053 #define COL_SHIFT 6 00054 00055 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) 00056 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 00057 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 00058 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) 00059 DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; 00060 00061 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { 00062 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, 00063 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, 00064 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, 00065 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b 00066 }; 00067 00068 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { 00069 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, 00070 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, 00071 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, 00072 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df 00073 }; 00074 00075 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { 00076 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, 00077 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, 00078 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, 00079 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 00080 }; 00081 00082 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { 00083 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, 00084 0x4b42, 
0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, 00085 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, 00086 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e 00087 }; 00088 00089 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { 00090 65536, 65536, 65536, 65536, 00091 3597, 3597, 3597, 3597, 00092 2260, 2260, 2260, 2260, 00093 1203, 1203, 1203, 1203, 00094 120, 120, 120, 120, 00095 512, 512, 512, 512 00096 }; 00097 00098 // Temporary storage before the column pass 00099 #define ROW1 "%%xmm6" 00100 #define ROW3 "%%xmm4" 00101 #define ROW5 "%%xmm5" 00102 #define ROW7 "%%xmm7" 00103 00104 #define CLEAR_ODD(r) "pxor "r","r" \n\t" 00105 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" 00106 00107 #if ARCH_X86_64 00108 00109 # define ROW0 "%%xmm8" 00110 # define REG0 ROW0 00111 # define ROW2 "%%xmm9" 00112 # define REG2 ROW2 00113 # define ROW4 "%%xmm10" 00114 # define REG4 ROW4 00115 # define ROW6 "%%xmm11" 00116 # define REG6 ROW6 00117 # define CLEAR_EVEN(r) CLEAR_ODD(r) 00118 # define PUT_EVEN(dst) PUT_ODD(dst) 00119 # define XMMS "%%xmm12" 00120 # define MOV_32_ONLY "#" 00121 # define SREG2 REG2 00122 # define TAN3 "%%xmm13" 00123 # define TAN1 "%%xmm14" 00124 00125 #else 00126 00127 # define ROW0 "(%0)" 00128 # define REG0 "%%xmm4" 00129 # define ROW2 "2*16(%0)" 00130 # define REG2 "%%xmm4" 00131 # define ROW4 "4*16(%0)" 00132 # define REG4 "%%xmm6" 00133 # define ROW6 "6*16(%0)" 00134 # define REG6 "%%xmm6" 00135 # define CLEAR_EVEN(r) 00136 # define PUT_EVEN(dst) \ 00137 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ 00138 "movdqa %%xmm2, "dst" \n\t" 00139 # define XMMS "%%xmm2" 00140 # define MOV_32_ONLY "movdqa " 00141 # define SREG2 "%%xmm7" 00142 # define TAN3 "%%xmm0" 00143 # define TAN1 "%%xmm2" 00144 00145 #endif 00146 00147 #define ROUND(x) "paddd "MANGLE(x) 00148 00149 #define JZ(reg, to) \ 00150 "testl "reg","reg" \n\t" \ 00151 "jz "to" \n\t" 00152 00153 #define JNZ(reg, to) \ 00154 "testl "reg","reg" \n\t" \ 
00155 "jnz "to" \n\t" 00156 00157 #define TEST_ONE_ROW(src, reg, clear) \ 00158 clear \ 00159 "movq "src", %%mm1 \n\t" \ 00160 "por 8+"src", %%mm1 \n\t" \ 00161 "paddusb %%mm0, %%mm1 \n\t" \ 00162 "pmovmskb %%mm1, "reg" \n\t" 00163 00164 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ 00165 clear1 \ 00166 clear2 \ 00167 "movq "row1", %%mm1 \n\t" \ 00168 "por 8+"row1", %%mm1 \n\t" \ 00169 "movq "row2", %%mm2 \n\t" \ 00170 "por 8+"row2", %%mm2 \n\t" \ 00171 "paddusb %%mm0, %%mm1 \n\t" \ 00172 "paddusb %%mm0, %%mm2 \n\t" \ 00173 "pmovmskb %%mm1, "reg1" \n\t" \ 00174 "pmovmskb %%mm2, "reg2" \n\t" 00175 00177 #define iMTX_MULT(src, table, rounder, put) \ 00178 "movdqa "src", %%xmm3 \n\t" \ 00179 "movdqa %%xmm3, %%xmm0 \n\t" \ 00180 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ 00181 "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ 00182 "pmaddwd "table", %%xmm0 \n\t" \ 00183 "pmaddwd 16+"table", %%xmm1 \n\t" \ 00184 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ 00185 "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ 00186 "pmaddwd 32+"table", %%xmm2 \n\t" \ 00187 "pmaddwd 48+"table", %%xmm3 \n\t" \ 00188 "paddd %%xmm1, %%xmm0 \n\t" \ 00189 "paddd %%xmm3, %%xmm2 \n\t" \ 00190 rounder", %%xmm0 \n\t" \ 00191 "movdqa %%xmm2, %%xmm3 \n\t" \ 00192 "paddd %%xmm0, %%xmm2 \n\t" \ 00193 "psubd %%xmm3, %%xmm0 \n\t" \ 00194 "psrad $11, %%xmm2 \n\t" \ 00195 "psrad $11, %%xmm0 \n\t" \ 00196 "packssdw %%xmm0, %%xmm2 \n\t" \ 00197 put \ 00198 "1: \n\t" 00199 00200 #define iLLM_HEAD \ 00201 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ 00202 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ 00203 00204 00205 #define iLLM_PASS(dct) \ 00206 "movdqa "TAN3", %%xmm1 \n\t" \ 00207 "movdqa "TAN1", %%xmm3 \n\t" \ 00208 "pmulhw %%xmm4, "TAN3" \n\t" \ 00209 "pmulhw %%xmm5, %%xmm1 \n\t" \ 00210 "paddsw %%xmm4, "TAN3" \n\t" \ 00211 "paddsw %%xmm5, %%xmm1 \n\t" \ 00212 "psubsw %%xmm5, "TAN3" \n\t" \ 00213 "paddsw %%xmm4, %%xmm1 \n\t" \ 00214 "pmulhw %%xmm7, %%xmm3 \n\t" \ 00215 "pmulhw %%xmm6, "TAN1" 
\n\t" \ 00216 "paddsw %%xmm6, %%xmm3 \n\t" \ 00217 "psubsw %%xmm7, "TAN1" \n\t" \ 00218 "movdqa %%xmm3, %%xmm7 \n\t" \ 00219 "movdqa "TAN1", %%xmm6 \n\t" \ 00220 "psubsw %%xmm1, %%xmm3 \n\t" \ 00221 "psubsw "TAN3", "TAN1" \n\t" \ 00222 "paddsw %%xmm7, %%xmm1 \n\t" \ 00223 "paddsw %%xmm6, "TAN3" \n\t" \ 00224 "movdqa %%xmm3, %%xmm6 \n\t" \ 00225 "psubsw "TAN3", %%xmm3 \n\t" \ 00226 "paddsw %%xmm6, "TAN3" \n\t" \ 00227 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ 00228 "pmulhw %%xmm4, %%xmm3 \n\t" \ 00229 "pmulhw %%xmm4, "TAN3" \n\t" \ 00230 "paddsw "TAN3", "TAN3" \n\t" \ 00231 "paddsw %%xmm3, %%xmm3 \n\t" \ 00232 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ 00233 MOV_32_ONLY ROW2", "REG2" \n\t" \ 00234 MOV_32_ONLY ROW6", "REG6" \n\t" \ 00235 "movdqa %%xmm7, %%xmm5 \n\t" \ 00236 "pmulhw "REG6", %%xmm7 \n\t" \ 00237 "pmulhw "REG2", %%xmm5 \n\t" \ 00238 "paddsw "REG2", %%xmm7 \n\t" \ 00239 "psubsw "REG6", %%xmm5 \n\t" \ 00240 MOV_32_ONLY ROW0", "REG0" \n\t" \ 00241 MOV_32_ONLY ROW4", "REG4" \n\t" \ 00242 MOV_32_ONLY" "TAN1", (%0) \n\t" \ 00243 "movdqa "REG0", "XMMS" \n\t" \ 00244 "psubsw "REG4", "REG0" \n\t" \ 00245 "paddsw "XMMS", "REG4" \n\t" \ 00246 "movdqa "REG4", "XMMS" \n\t" \ 00247 "psubsw %%xmm7, "REG4" \n\t" \ 00248 "paddsw "XMMS", %%xmm7 \n\t" \ 00249 "movdqa "REG0", "XMMS" \n\t" \ 00250 "psubsw %%xmm5, "REG0" \n\t" \ 00251 "paddsw "XMMS", %%xmm5 \n\t" \ 00252 "movdqa %%xmm5, "XMMS" \n\t" \ 00253 "psubsw "TAN3", %%xmm5 \n\t" \ 00254 "paddsw "XMMS", "TAN3" \n\t" \ 00255 "movdqa "REG0", "XMMS" \n\t" \ 00256 "psubsw %%xmm3, "REG0" \n\t" \ 00257 "paddsw "XMMS", %%xmm3 \n\t" \ 00258 MOV_32_ONLY" (%0), "TAN1" \n\t" \ 00259 "psraw $6, %%xmm5 \n\t" \ 00260 "psraw $6, "REG0" \n\t" \ 00261 "psraw $6, "TAN3" \n\t" \ 00262 "psraw $6, %%xmm3 \n\t" \ 00263 "movdqa "TAN3", 1*16("dct") \n\t" \ 00264 "movdqa %%xmm3, 2*16("dct") \n\t" \ 00265 "movdqa "REG0", 5*16("dct") \n\t" \ 00266 "movdqa %%xmm5, 6*16("dct") \n\t" \ 00267 "movdqa %%xmm7, %%xmm0 \n\t" \ 00268 "movdqa "REG4", %%xmm4 
\n\t" \ 00269 "psubsw %%xmm1, %%xmm7 \n\t" \ 00270 "psubsw "TAN1", "REG4" \n\t" \ 00271 "paddsw %%xmm0, %%xmm1 \n\t" \ 00272 "paddsw %%xmm4, "TAN1" \n\t" \ 00273 "psraw $6, %%xmm1 \n\t" \ 00274 "psraw $6, %%xmm7 \n\t" \ 00275 "psraw $6, "TAN1" \n\t" \ 00276 "psraw $6, "REG4" \n\t" \ 00277 "movdqa %%xmm1, ("dct") \n\t" \ 00278 "movdqa "TAN1", 3*16("dct") \n\t" \ 00279 "movdqa "REG4", 4*16("dct") \n\t" \ 00280 "movdqa %%xmm7, 7*16("dct") \n\t" 00281 00283 #define iLLM_PASS_SPARSE(dct) \ 00284 "pmulhw %%xmm4, "TAN3" \n\t" \ 00285 "paddsw %%xmm4, "TAN3" \n\t" \ 00286 "movdqa %%xmm6, %%xmm3 \n\t" \ 00287 "pmulhw %%xmm6, "TAN1" \n\t" \ 00288 "movdqa %%xmm4, %%xmm1 \n\t" \ 00289 "psubsw %%xmm1, %%xmm3 \n\t" \ 00290 "paddsw %%xmm6, %%xmm1 \n\t" \ 00291 "movdqa "TAN1", %%xmm6 \n\t" \ 00292 "psubsw "TAN3", "TAN1" \n\t" \ 00293 "paddsw %%xmm6, "TAN3" \n\t" \ 00294 "movdqa %%xmm3, %%xmm6 \n\t" \ 00295 "psubsw "TAN3", %%xmm3 \n\t" \ 00296 "paddsw %%xmm6, "TAN3" \n\t" \ 00297 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ 00298 "pmulhw %%xmm4, %%xmm3 \n\t" \ 00299 "pmulhw %%xmm4, "TAN3" \n\t" \ 00300 "paddsw "TAN3", "TAN3" \n\t" \ 00301 "paddsw %%xmm3, %%xmm3 \n\t" \ 00302 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ 00303 MOV_32_ONLY ROW2", "SREG2" \n\t" \ 00304 "pmulhw "SREG2", %%xmm5 \n\t" \ 00305 MOV_32_ONLY ROW0", "REG0" \n\t" \ 00306 "movdqa "REG0", %%xmm6 \n\t" \ 00307 "psubsw "SREG2", %%xmm6 \n\t" \ 00308 "paddsw "REG0", "SREG2" \n\t" \ 00309 MOV_32_ONLY" "TAN1", (%0) \n\t" \ 00310 "movdqa "REG0", "XMMS" \n\t" \ 00311 "psubsw %%xmm5, "REG0" \n\t" \ 00312 "paddsw "XMMS", %%xmm5 \n\t" \ 00313 "movdqa %%xmm5, "XMMS" \n\t" \ 00314 "psubsw "TAN3", %%xmm5 \n\t" \ 00315 "paddsw "XMMS", "TAN3" \n\t" \ 00316 "movdqa "REG0", "XMMS" \n\t" \ 00317 "psubsw %%xmm3, "REG0" \n\t" \ 00318 "paddsw "XMMS", %%xmm3 \n\t" \ 00319 MOV_32_ONLY" (%0), "TAN1" \n\t" \ 00320 "psraw $6, %%xmm5 \n\t" \ 00321 "psraw $6, "REG0" \n\t" \ 00322 "psraw $6, "TAN3" \n\t" \ 00323 "psraw $6, %%xmm3 \n\t" \ 00324 "movdqa 
"TAN3", 1*16("dct") \n\t" \ 00325 "movdqa %%xmm3, 2*16("dct") \n\t" \ 00326 "movdqa "REG0", 5*16("dct") \n\t" \ 00327 "movdqa %%xmm5, 6*16("dct") \n\t" \ 00328 "movdqa "SREG2", %%xmm0 \n\t" \ 00329 "movdqa %%xmm6, %%xmm4 \n\t" \ 00330 "psubsw %%xmm1, "SREG2" \n\t" \ 00331 "psubsw "TAN1", %%xmm6 \n\t" \ 00332 "paddsw %%xmm0, %%xmm1 \n\t" \ 00333 "paddsw %%xmm4, "TAN1" \n\t" \ 00334 "psraw $6, %%xmm1 \n\t" \ 00335 "psraw $6, "SREG2" \n\t" \ 00336 "psraw $6, "TAN1" \n\t" \ 00337 "psraw $6, %%xmm6 \n\t" \ 00338 "movdqa %%xmm1, ("dct") \n\t" \ 00339 "movdqa "TAN1", 3*16("dct") \n\t" \ 00340 "movdqa %%xmm6, 4*16("dct") \n\t" \ 00341 "movdqa "SREG2", 7*16("dct") \n\t" 00342 00343 inline void ff_idct_xvid_sse2(short *block) 00344 { 00345 __asm__ volatile( 00346 "movq "MANGLE(m127)", %%mm0 \n\t" 00347 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) 00348 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) 00349 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) 00350 00351 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) 00352 JZ("%%eax", "1f") 00353 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) 00354 00355 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) 00356 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) 00357 iLLM_HEAD 00358 ASMALIGN(4) 00359 JNZ("%%ecx", "2f") 00360 JNZ("%%eax", "3f") 00361 JNZ("%%edx", "4f") 00362 JNZ("%%esi", "5f") 00363 iLLM_PASS_SPARSE("%0") 00364 "jmp 6f \n\t" 00365 "2: \n\t" 00366 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) 00367 "3: \n\t" 00368 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) 00369 JZ("%%edx", "1f") 00370 "4: \n\t" 00371 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) 00372 JZ("%%esi", "1f") 00373 "5: \n\t" 00374 
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) 00375 #if !ARCH_X86_64 00376 iLLM_HEAD 00377 #endif 00378 iLLM_PASS("%0") 00379 "6: \n\t" 00380 : "+r"(block) 00381 : 00382 : "%eax", "%ecx", "%edx", "%esi", "memory"); 00383 } 00384 00385 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) 00386 { 00387 ff_idct_xvid_sse2(block); 00388 put_pixels_clamped_mmx(block, dest, line_size); 00389 } 00390 00391 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) 00392 { 00393 ff_idct_xvid_sse2(block); 00394 add_pixels_clamped_mmx(block, dest, line_size); 00395 }