/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 *                    Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
{
    DECLARE_ALIGNED(8, uint64_t, AA);
    DECLARE_ALIGNED(8, uint64_t, DD);
    int i;

    if (y == 0 && x == 0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (y == 0 || x == 0) {
        /* 1 dimensional filter only */
        const int dxy = x ? 1 : stride;

        __asm__ volatile(
            "movd %0, %%mm5\n\t"
            "movq %1, %%mm4\n\t"
            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
            "punpcklwd %%mm5, %%mm5\n\t"
            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
            "pxor %%mm7, %%mm7\n\t"
            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));

        for (i = 0; i < h; i++) {
            __asm__ volatile(
                /* mm0 = src[0..7], mm1 = src[1..8] */
                "movq %0, %%mm0\n\t"
                "movq %1, %%mm2\n\t"
                :: "m"(src[0]), "m"(src[dxy]));

            __asm__ volatile(
                /* [mm0,mm1] = A * src[0..7] */
                /* [mm2,mm3] = B * src[1..8] */
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm0\n\t"
                "pmullw %%mm4, %%mm1\n\t"
                "pmullw %%mm5, %%mm2\n\t"
                "pmullw %%mm5, %%mm3\n\t"

                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
                "paddw %%mm6, %%mm0\n\t"
                "paddw %%mm6, %%mm1\n\t"
                "paddw %%mm2, %%mm0\n\t"
                "paddw %%mm3, %%mm1\n\t"
                "psrlw $3, %%mm0\n\t"
                "psrlw $3, %%mm1\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                H264_CHROMA_OP(%0, %%mm0)
                "movq %%mm0, %0\n\t"
                : "=m" (dst[0]));

            src += stride;
            dst += stride;
        }
        return;
    }

    /* general case, bilinear */
    __asm__ volatile(
        "movd %2, %%mm4\n\t"
        "movd %3, %%mm6\n\t"
        "punpcklwd %%mm4, %%mm4\n\t"
        "punpcklwd %%mm6, %%mm6\n\t"
        "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
        "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
        "movq %%mm4, %%mm5\n\t"
        "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
        "psllw $3, %%mm5\n\t"
        "psllw $3, %%mm6\n\t"
        "movq %%mm5, %%mm7\n\t"
        "paddw %%mm6, %%mm7\n\t"
        "movq %%mm4, %1\n\t"         /* DD = x * y */
        "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
        "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
        "paddw %4, %%mm4\n\t"
        "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
        "pxor %%mm7, %%mm7\n\t"
        "movq %%mm4, %0\n\t"
        : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    __asm__ volatile(
        /* mm0 = src[0..7], mm1 = src[1..8] */
        "movq %0, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for (i = 0; i < h; i++) {
        src += stride;

        __asm__ volatile(
            /* mm2 = A * src[0..3] + B * src[1..4] */
            /* mm3 = A * src[4..7] + B * src[5..8] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpckhbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "pmullw %0, %%mm0\n\t"
            "pmullw %0, %%mm2\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "pmullw %%mm5, %%mm3\n\t"
            "paddw %%mm1, %%mm2\n\t"
            "paddw %%mm0, %%mm3\n\t"
            : : "m" (AA));

        __asm__ volatile(
            /* [mm2,mm3] += C * src[0..7] */
            "movq %0, %%mm0\n\t"
            "movq %%mm0, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "pmullw %%mm6, %%mm0\n\t"
            "pmullw %%mm6, %%mm1\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm1, %%mm3\n\t"
            : : "m" (src[0]));

        __asm__ volatile(
            /* [mm2,mm3] += D * src[1..8] */
            "movq %1, %%mm1\n\t"
            "movq %%mm1, %%mm0\n\t"
            "movq %%mm1, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm4\n\t"
            "pmullw %2, %%mm0\n\t"
            "pmullw %2, %%mm4\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm4, %%mm3\n\t"
            "movq %0, %%mm0\n\t"
            : : "m" (src[0]), "m" (src[1]), "m" (DD));

        __asm__ volatile(
            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
            "paddw %1, %%mm2\n\t"
            "paddw %1, %%mm3\n\t"
            "psrlw $6, %%mm2\n\t"
            "psrlw $6, %%mm3\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            H264_CHROMA_OP(%0, %%mm2)
            "movq %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (*rnd_reg));
        dst += stride;
    }
}

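/*
 * Illustrative scalar equivalent of H264_CHROMA_MC8_TMPL above (a sketch, not
 * part of the template): H.264 chroma MC is a bilinear 2x2 filter with weights
 * A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y, rounded as (sum+32)>>6,
 * which is exactly what the general-case MMX path computes.  The function name
 * is hypothetical; it assumes the plain "put" store (no H264_CHROMA_OP
 * averaging) and is kept under #if 0 since this template is presumably
 * included more than once with different macro definitions.
 */
#if 0
static void chroma_mc8_c_sketch(uint8_t *dst, uint8_t *src, int stride,
                                int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        src += stride;
        dst += stride;
    }
}
#endif
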
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movd %5, %%mm2             \n\t"
        "movd %6, %%mm3             \n\t"
        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
        "punpcklwd %%mm2, %%mm2     \n\t"
        "punpcklwd %%mm3, %%mm3     \n\t"
        "punpcklwd %%mm2, %%mm2     \n\t"
        "punpcklwd %%mm3, %%mm3     \n\t"
        "psubw %%mm2, %%mm4         \n\t"
        "psubw %%mm3, %%mm5         \n\t"

        "movd  (%1), %%mm0          \n\t"
        "movd 1(%1), %%mm6          \n\t"
        "add %3, %1                 \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpcklbw %%mm7, %%mm6     \n\t"
        "pmullw %%mm4, %%mm0        \n\t"
        "pmullw %%mm2, %%mm6        \n\t"
        "paddw %%mm0, %%mm6         \n\t"

        "1:                         \n\t"
        "movd  (%1), %%mm0          \n\t"
        "movd 1(%1), %%mm1          \n\t"
        "add %3, %1                 \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpcklbw %%mm7, %%mm1     \n\t"
        "pmullw %%mm4, %%mm0        \n\t"
        "pmullw %%mm2, %%mm1        \n\t"
        "paddw %%mm0, %%mm1         \n\t"
        "movq %%mm1, %%mm0          \n\t"
        "pmullw %%mm5, %%mm6        \n\t"
        "pmullw %%mm3, %%mm1        \n\t"
        "paddw %4, %%mm6            \n\t"
        "paddw %%mm6, %%mm1         \n\t"
        "psrlw $6, %%mm1            \n\t"
        "packuswb %%mm1, %%mm1      \n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
        "movd %%mm1, (%0)           \n\t"
        "add %3, %0                 \n\t"
        "movd  (%1), %%mm6          \n\t"
        "movd 1(%1), %%mm1          \n\t"
        "add %3, %1                 \n\t"
        "punpcklbw %%mm7, %%mm6     \n\t"
        "punpcklbw %%mm7, %%mm1     \n\t"
        "pmullw %%mm4, %%mm6        \n\t"
        "pmullw %%mm2, %%mm1        \n\t"
        "paddw %%mm6, %%mm1         \n\t"
        "movq %%mm1, %%mm6          \n\t"
        "pmullw %%mm5, %%mm0        \n\t"
        "pmullw %%mm3, %%mm1        \n\t"
        "paddw %4, %%mm0            \n\t"
        "paddw %%mm0, %%mm1         \n\t"
        "psrlw $6, %%mm1            \n\t"
        "packuswb %%mm1, %%mm1      \n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
        "movd %%mm1, (%0)           \n\t"
        "add %3, %0                 \n\t"
        "sub $2, %2                 \n\t"
        "jnz 1b                     \n\t"
        : "+r"(dst), "+r"(src), "+r"(h)
        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
    );
}

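/*
 * The 4-wide loop above filters each source row horizontally only once and
 * reuses that result for the vertical blend of two consecutive output rows
 * (mm6 and mm0 carry the previous row's horizontal sum, and the loop body is
 * unrolled twice, so h is expected to be even).  A scalar sketch of the same
 * row-reuse idea, with a hypothetical name, assuming plain "put" rounding
 * ((sum + 32) >> 6) rather than the H264_CHROMA_OP4 variant:
 */
#if 0
static void chroma_mc4_rowreuse_sketch(uint8_t *dst, uint8_t *src, int stride,
                                       int h, int x, int y)
{
    int prev[4], cur, j;

    /* horizontal filter of the first source row */
    for (j = 0; j < 4; j++)
        prev[j] = (8 - x) * src[j] + x * src[j + 1];

    while (h--) {
        src += stride;
        for (j = 0; j < 4; j++) {
            cur     = (8 - x) * src[j] + x * src[j + 1];
            dst[j]  = ((8 - y) * prev[j] + y * cur + 32) >> 6;
            prev[j] = cur;
        }
        dst += stride;
    }
}
#endif
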
#ifdef H264_CHROMA_MC2_TMPL
static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    int tmp = ((1<<16)-1)*x + 8;
    int CD  = tmp * y;
    int AB  = (tmp << 3) - CD;

    __asm__ volatile(
        /* mm5 = {A,B,A,B} */
        /* mm6 = {C,D,C,D} */
        "movd %0, %%mm5\n\t"
        "movd %1, %%mm6\n\t"
        "punpckldq %%mm5, %%mm5\n\t"
        "punpckldq %%mm6, %%mm6\n\t"
        "pxor %%mm7, %%mm7\n\t"
        /* mm0 = src[0,1,1,2] */
        "movd %2, %%mm2\n\t"
        "punpcklbw %%mm7, %%mm2\n\t"
        "pshufw $0x94, %%mm2, %%mm2\n\t"
        :: "r"(AB), "r"(CD), "m"(src[0]));

    __asm__ volatile(
        "1:\n\t"
        "add %4, %1\n\t"
        /* mm1 = A * src[0,1] + B * src[1,2] */
        "movq %%mm2, %%mm1\n\t"
        "pmaddwd %%mm5, %%mm1\n\t"
        /* mm0 = src[0,1,1,2] */
        "movd (%1), %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        /* mm1 += C * src[0,1] + D * src[1,2] */
        "movq %%mm0, %%mm2\n\t"
        "pmaddwd %%mm6, %%mm0\n\t"
        "paddw %3, %%mm1\n\t"
        "paddw %%mm0, %%mm1\n\t"
        /* dst[0,1] = pack((mm1 + 32) >> 6) */
        "psrlw $6, %%mm1\n\t"
        "packssdw %%mm7, %%mm1\n\t"
        "packuswb %%mm7, %%mm1\n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
        "movd %%mm1, %%esi\n\t"
        "movw %%si, (%0)\n\t"
        "add %4, %0\n\t"
        "sub $1, %2\n\t"
        "jnz 1b\n\t"
        : "+r"(dst), "+r"(src), "+r"(h)
        : "m"(ff_pw_32), "r"((x86_reg)stride)
        : "%esi");
}
#endif
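
/*
 * The 2-wide variant above packs the four bilinear weights into two 32-bit
 * constants so a single pmaddwd handles a whole 2-pixel row:
 *   tmp = ((1<<16)-1)*x + 8  ->  low word = 8-x,          high word = x
 *   CD  = tmp*y              ->  low word = (8-x)*y (C),  high word = x*y (D)
 *   AB  = (tmp<<3) - CD      ->  low word = (8-x)*(8-y) (A), high word = x*(8-y) (B)
 * A plain-C sketch of that packing, with a hypothetical name, shown only for
 * illustration under #if 0:
 */
#if 0
static void pack_chroma_weights_sketch(int x, int y, int *AB, int *CD)
{
    int tmp = ((1 << 16) - 1) * x + 8;   /* (x << 16) + (8 - x) */
    *CD = tmp * y;                       /* ((x*y) << 16) + (8-x)*y         */
    *AB = (tmp << 3) - *CD;              /* ((x*(8-y)) << 16) + (8-x)*(8-y) */
}
#endif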