/* Libav */
00001 /* 00002 * DSP utils : average functions are compiled twice for 3dnow/mmx2 00003 * Copyright (c) 2000, 2001 Fabrice Bellard 00004 * Copyright (c) 2002-2004 Michael Niedermayer 00005 * 00006 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 00007 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> 00008 * and improved by Zdenek Kabelac <kabi@users.sf.net> 00009 * 00010 * This file is part of FFmpeg. 00011 * 00012 * FFmpeg is free software; you can redistribute it and/or 00013 * modify it under the terms of the GNU Lesser General Public 00014 * License as published by the Free Software Foundation; either 00015 * version 2.1 of the License, or (at your option) any later version. 00016 * 00017 * FFmpeg is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00020 * Lesser General Public License for more details. 00021 * 00022 * You should have received a copy of the GNU Lesser General Public 00023 * License along with FFmpeg; if not, write to the Free Software 00024 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00025 */ 00026 00027 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm 00028 clobber bug - now it will work with 2.95.2 and also with -fPIC 00029 */ 00030 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00031 { 00032 __asm__ volatile( 00033 "lea (%3, %3), %%"REG_a" \n\t" 00034 "1: \n\t" 00035 "movq (%1), %%mm0 \n\t" 00036 "movq (%1, %3), %%mm1 \n\t" 00037 PAVGB" 1(%1), %%mm0 \n\t" 00038 PAVGB" 1(%1, %3), %%mm1 \n\t" 00039 "movq %%mm0, (%2) \n\t" 00040 "movq %%mm1, (%2, %3) \n\t" 00041 "add %%"REG_a", %1 \n\t" 00042 "add %%"REG_a", %2 \n\t" 00043 "movq (%1), %%mm0 \n\t" 00044 "movq (%1, %3), %%mm1 \n\t" 00045 PAVGB" 1(%1), %%mm0 \n\t" 00046 PAVGB" 1(%1, %3), %%mm1 \n\t" 00047 "add %%"REG_a", %1 \n\t" 00048 "movq 
%%mm0, (%2) \n\t" 00049 "movq %%mm1, (%2, %3) \n\t" 00050 "add %%"REG_a", %2 \n\t" 00051 "subl $4, %0 \n\t" 00052 "jnz 1b \n\t" 00053 :"+g"(h), "+S"(pixels), "+D"(block) 00054 :"r" ((x86_reg)line_size) 00055 :"%"REG_a, "memory"); 00056 } 00057 00058 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00059 { 00060 __asm__ volatile( 00061 "testl $1, %0 \n\t" 00062 " jz 1f \n\t" 00063 "movd (%1), %%mm0 \n\t" 00064 "movd (%2), %%mm1 \n\t" 00065 "add %4, %1 \n\t" 00066 "add $4, %2 \n\t" 00067 PAVGB" %%mm1, %%mm0 \n\t" 00068 "movd %%mm0, (%3) \n\t" 00069 "add %5, %3 \n\t" 00070 "decl %0 \n\t" 00071 "1: \n\t" 00072 "movd (%1), %%mm0 \n\t" 00073 "add %4, %1 \n\t" 00074 "movd (%1), %%mm1 \n\t" 00075 "movd (%2), %%mm2 \n\t" 00076 "movd 4(%2), %%mm3 \n\t" 00077 "add %4, %1 \n\t" 00078 PAVGB" %%mm2, %%mm0 \n\t" 00079 PAVGB" %%mm3, %%mm1 \n\t" 00080 "movd %%mm0, (%3) \n\t" 00081 "add %5, %3 \n\t" 00082 "movd %%mm1, (%3) \n\t" 00083 "add %5, %3 \n\t" 00084 "movd (%1), %%mm0 \n\t" 00085 "add %4, %1 \n\t" 00086 "movd (%1), %%mm1 \n\t" 00087 "movd 8(%2), %%mm2 \n\t" 00088 "movd 12(%2), %%mm3 \n\t" 00089 "add %4, %1 \n\t" 00090 PAVGB" %%mm2, %%mm0 \n\t" 00091 PAVGB" %%mm3, %%mm1 \n\t" 00092 "movd %%mm0, (%3) \n\t" 00093 "add %5, %3 \n\t" 00094 "movd %%mm1, (%3) \n\t" 00095 "add %5, %3 \n\t" 00096 "add $16, %2 \n\t" 00097 "subl $4, %0 \n\t" 00098 "jnz 1b \n\t" 00099 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00101 #else 00102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00103 #endif 00104 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00105 :"memory"); 00106 } 00107 00108 00109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00110 { 00111 __asm__ volatile( 00112 "testl $1, %0 \n\t" 00113 " jz 1f \n\t" 00114 "movq (%1), %%mm0 \n\t" 00115 "movq 
(%2), %%mm1 \n\t" 00116 "add %4, %1 \n\t" 00117 "add $8, %2 \n\t" 00118 PAVGB" %%mm1, %%mm0 \n\t" 00119 "movq %%mm0, (%3) \n\t" 00120 "add %5, %3 \n\t" 00121 "decl %0 \n\t" 00122 "1: \n\t" 00123 "movq (%1), %%mm0 \n\t" 00124 "add %4, %1 \n\t" 00125 "movq (%1), %%mm1 \n\t" 00126 "add %4, %1 \n\t" 00127 PAVGB" (%2), %%mm0 \n\t" 00128 PAVGB" 8(%2), %%mm1 \n\t" 00129 "movq %%mm0, (%3) \n\t" 00130 "add %5, %3 \n\t" 00131 "movq %%mm1, (%3) \n\t" 00132 "add %5, %3 \n\t" 00133 "movq (%1), %%mm0 \n\t" 00134 "add %4, %1 \n\t" 00135 "movq (%1), %%mm1 \n\t" 00136 "add %4, %1 \n\t" 00137 PAVGB" 16(%2), %%mm0 \n\t" 00138 PAVGB" 24(%2), %%mm1 \n\t" 00139 "movq %%mm0, (%3) \n\t" 00140 "add %5, %3 \n\t" 00141 "movq %%mm1, (%3) \n\t" 00142 "add %5, %3 \n\t" 00143 "add $32, %2 \n\t" 00144 "subl $4, %0 \n\t" 00145 "jnz 1b \n\t" 00146 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00147 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00148 #else 00149 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00150 #endif 00151 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00152 :"memory"); 00153 //the following should be used, though better not with gcc ... 
00154 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00155 :"r"(src1Stride), "r"(dstStride) 00156 :"memory");*/ 00157 } 00158 00159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00160 { 00161 __asm__ volatile( 00162 "pcmpeqb %%mm6, %%mm6 \n\t" 00163 "testl $1, %0 \n\t" 00164 " jz 1f \n\t" 00165 "movq (%1), %%mm0 \n\t" 00166 "movq (%2), %%mm1 \n\t" 00167 "add %4, %1 \n\t" 00168 "add $8, %2 \n\t" 00169 "pxor %%mm6, %%mm0 \n\t" 00170 "pxor %%mm6, %%mm1 \n\t" 00171 PAVGB" %%mm1, %%mm0 \n\t" 00172 "pxor %%mm6, %%mm0 \n\t" 00173 "movq %%mm0, (%3) \n\t" 00174 "add %5, %3 \n\t" 00175 "decl %0 \n\t" 00176 "1: \n\t" 00177 "movq (%1), %%mm0 \n\t" 00178 "add %4, %1 \n\t" 00179 "movq (%1), %%mm1 \n\t" 00180 "add %4, %1 \n\t" 00181 "movq (%2), %%mm2 \n\t" 00182 "movq 8(%2), %%mm3 \n\t" 00183 "pxor %%mm6, %%mm0 \n\t" 00184 "pxor %%mm6, %%mm1 \n\t" 00185 "pxor %%mm6, %%mm2 \n\t" 00186 "pxor %%mm6, %%mm3 \n\t" 00187 PAVGB" %%mm2, %%mm0 \n\t" 00188 PAVGB" %%mm3, %%mm1 \n\t" 00189 "pxor %%mm6, %%mm0 \n\t" 00190 "pxor %%mm6, %%mm1 \n\t" 00191 "movq %%mm0, (%3) \n\t" 00192 "add %5, %3 \n\t" 00193 "movq %%mm1, (%3) \n\t" 00194 "add %5, %3 \n\t" 00195 "movq (%1), %%mm0 \n\t" 00196 "add %4, %1 \n\t" 00197 "movq (%1), %%mm1 \n\t" 00198 "add %4, %1 \n\t" 00199 "movq 16(%2), %%mm2 \n\t" 00200 "movq 24(%2), %%mm3 \n\t" 00201 "pxor %%mm6, %%mm0 \n\t" 00202 "pxor %%mm6, %%mm1 \n\t" 00203 "pxor %%mm6, %%mm2 \n\t" 00204 "pxor %%mm6, %%mm3 \n\t" 00205 PAVGB" %%mm2, %%mm0 \n\t" 00206 PAVGB" %%mm3, %%mm1 \n\t" 00207 "pxor %%mm6, %%mm0 \n\t" 00208 "pxor %%mm6, %%mm1 \n\t" 00209 "movq %%mm0, (%3) \n\t" 00210 "add %5, %3 \n\t" 00211 "movq %%mm1, (%3) \n\t" 00212 "add %5, %3 \n\t" 00213 "add $32, %2 \n\t" 00214 "subl $4, %0 \n\t" 00215 "jnz 1b \n\t" 00216 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00217 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00218 #else 00219 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00220 #endif 00221 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00222 :"memory"); 00223 //the following should be used, though better not with gcc ... 00224 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00225 :"r"(src1Stride), "r"(dstStride) 00226 :"memory");*/ 00227 } 00228 00229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00230 { 00231 __asm__ volatile( 00232 "testl $1, %0 \n\t" 00233 " jz 1f \n\t" 00234 "movd (%1), %%mm0 \n\t" 00235 "movd (%2), %%mm1 \n\t" 00236 "add %4, %1 \n\t" 00237 "add $4, %2 \n\t" 00238 PAVGB" %%mm1, %%mm0 \n\t" 00239 PAVGB" (%3), %%mm0 \n\t" 00240 "movd %%mm0, (%3) \n\t" 00241 "add %5, %3 \n\t" 00242 "decl %0 \n\t" 00243 "1: \n\t" 00244 "movd (%1), %%mm0 \n\t" 00245 "add %4, %1 \n\t" 00246 "movd (%1), %%mm1 \n\t" 00247 "add %4, %1 \n\t" 00248 PAVGB" (%2), %%mm0 \n\t" 00249 PAVGB" 4(%2), %%mm1 \n\t" 00250 PAVGB" (%3), %%mm0 \n\t" 00251 "movd %%mm0, (%3) \n\t" 00252 "add %5, %3 \n\t" 00253 PAVGB" (%3), %%mm1 \n\t" 00254 "movd %%mm1, (%3) \n\t" 00255 "add %5, %3 \n\t" 00256 "movd (%1), %%mm0 \n\t" 00257 "add %4, %1 \n\t" 00258 "movd (%1), %%mm1 \n\t" 00259 "add %4, %1 \n\t" 00260 PAVGB" 8(%2), %%mm0 \n\t" 00261 PAVGB" 12(%2), %%mm1 \n\t" 00262 PAVGB" (%3), %%mm0 \n\t" 00263 "movd %%mm0, (%3) \n\t" 00264 "add %5, %3 \n\t" 00265 PAVGB" (%3), %%mm1 \n\t" 00266 "movd %%mm1, (%3) \n\t" 00267 "add %5, %3 \n\t" 00268 "add $16, %2 \n\t" 00269 "subl $4, %0 \n\t" 00270 "jnz 1b \n\t" 00271 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00272 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00273 #else 00274 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00275 #endif 00276 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00277 :"memory"); 00278 } 00279 00280 00281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00282 
{ 00283 __asm__ volatile( 00284 "testl $1, %0 \n\t" 00285 " jz 1f \n\t" 00286 "movq (%1), %%mm0 \n\t" 00287 "movq (%2), %%mm1 \n\t" 00288 "add %4, %1 \n\t" 00289 "add $8, %2 \n\t" 00290 PAVGB" %%mm1, %%mm0 \n\t" 00291 PAVGB" (%3), %%mm0 \n\t" 00292 "movq %%mm0, (%3) \n\t" 00293 "add %5, %3 \n\t" 00294 "decl %0 \n\t" 00295 "1: \n\t" 00296 "movq (%1), %%mm0 \n\t" 00297 "add %4, %1 \n\t" 00298 "movq (%1), %%mm1 \n\t" 00299 "add %4, %1 \n\t" 00300 PAVGB" (%2), %%mm0 \n\t" 00301 PAVGB" 8(%2), %%mm1 \n\t" 00302 PAVGB" (%3), %%mm0 \n\t" 00303 "movq %%mm0, (%3) \n\t" 00304 "add %5, %3 \n\t" 00305 PAVGB" (%3), %%mm1 \n\t" 00306 "movq %%mm1, (%3) \n\t" 00307 "add %5, %3 \n\t" 00308 "movq (%1), %%mm0 \n\t" 00309 "add %4, %1 \n\t" 00310 "movq (%1), %%mm1 \n\t" 00311 "add %4, %1 \n\t" 00312 PAVGB" 16(%2), %%mm0 \n\t" 00313 PAVGB" 24(%2), %%mm1 \n\t" 00314 PAVGB" (%3), %%mm0 \n\t" 00315 "movq %%mm0, (%3) \n\t" 00316 "add %5, %3 \n\t" 00317 PAVGB" (%3), %%mm1 \n\t" 00318 "movq %%mm1, (%3) \n\t" 00319 "add %5, %3 \n\t" 00320 "add $32, %2 \n\t" 00321 "subl $4, %0 \n\t" 00322 "jnz 1b \n\t" 00323 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00324 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00325 #else 00326 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00327 #endif 00328 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00329 :"memory"); 00330 //the following should be used, though better not with gcc ... 
00331 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00332 :"r"(src1Stride), "r"(dstStride) 00333 :"memory");*/ 00334 } 00335 00336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00337 { 00338 __asm__ volatile( 00339 "lea (%3, %3), %%"REG_a" \n\t" 00340 "1: \n\t" 00341 "movq (%1), %%mm0 \n\t" 00342 "movq (%1, %3), %%mm1 \n\t" 00343 "movq 8(%1), %%mm2 \n\t" 00344 "movq 8(%1, %3), %%mm3 \n\t" 00345 PAVGB" 1(%1), %%mm0 \n\t" 00346 PAVGB" 1(%1, %3), %%mm1 \n\t" 00347 PAVGB" 9(%1), %%mm2 \n\t" 00348 PAVGB" 9(%1, %3), %%mm3 \n\t" 00349 "movq %%mm0, (%2) \n\t" 00350 "movq %%mm1, (%2, %3) \n\t" 00351 "movq %%mm2, 8(%2) \n\t" 00352 "movq %%mm3, 8(%2, %3) \n\t" 00353 "add %%"REG_a", %1 \n\t" 00354 "add %%"REG_a", %2 \n\t" 00355 "movq (%1), %%mm0 \n\t" 00356 "movq (%1, %3), %%mm1 \n\t" 00357 "movq 8(%1), %%mm2 \n\t" 00358 "movq 8(%1, %3), %%mm3 \n\t" 00359 PAVGB" 1(%1), %%mm0 \n\t" 00360 PAVGB" 1(%1, %3), %%mm1 \n\t" 00361 PAVGB" 9(%1), %%mm2 \n\t" 00362 PAVGB" 9(%1, %3), %%mm3 \n\t" 00363 "add %%"REG_a", %1 \n\t" 00364 "movq %%mm0, (%2) \n\t" 00365 "movq %%mm1, (%2, %3) \n\t" 00366 "movq %%mm2, 8(%2) \n\t" 00367 "movq %%mm3, 8(%2, %3) \n\t" 00368 "add %%"REG_a", %2 \n\t" 00369 "subl $4, %0 \n\t" 00370 "jnz 1b \n\t" 00371 :"+g"(h), "+S"(pixels), "+D"(block) 00372 :"r" ((x86_reg)line_size) 00373 :"%"REG_a, "memory"); 00374 } 00375 00376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00377 { 00378 __asm__ volatile( 00379 "testl $1, %0 \n\t" 00380 " jz 1f \n\t" 00381 "movq (%1), %%mm0 \n\t" 00382 "movq 8(%1), %%mm1 \n\t" 00383 PAVGB" (%2), %%mm0 \n\t" 00384 PAVGB" 8(%2), %%mm1 \n\t" 00385 "add %4, %1 \n\t" 00386 "add $16, %2 \n\t" 00387 "movq %%mm0, (%3) \n\t" 00388 "movq %%mm1, 8(%3) \n\t" 00389 "add %5, %3 \n\t" 00390 "decl %0 \n\t" 00391 "1: \n\t" 00392 "movq (%1), %%mm0 \n\t" 00393 "movq 8(%1), %%mm1 \n\t" 00394 "add %4, %1 \n\t" 00395 PAVGB" (%2), %%mm0 \n\t" 
00396 PAVGB" 8(%2), %%mm1 \n\t" 00397 "movq %%mm0, (%3) \n\t" 00398 "movq %%mm1, 8(%3) \n\t" 00399 "add %5, %3 \n\t" 00400 "movq (%1), %%mm0 \n\t" 00401 "movq 8(%1), %%mm1 \n\t" 00402 "add %4, %1 \n\t" 00403 PAVGB" 16(%2), %%mm0 \n\t" 00404 PAVGB" 24(%2), %%mm1 \n\t" 00405 "movq %%mm0, (%3) \n\t" 00406 "movq %%mm1, 8(%3) \n\t" 00407 "add %5, %3 \n\t" 00408 "add $32, %2 \n\t" 00409 "subl $2, %0 \n\t" 00410 "jnz 1b \n\t" 00411 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00412 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00413 #else 00414 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00415 #endif 00416 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00417 :"memory"); 00418 //the following should be used, though better not with gcc ... 00419 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00420 :"r"(src1Stride), "r"(dstStride) 00421 :"memory");*/ 00422 } 00423 00424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00425 { 00426 __asm__ volatile( 00427 "testl $1, %0 \n\t" 00428 " jz 1f \n\t" 00429 "movq (%1), %%mm0 \n\t" 00430 "movq 8(%1), %%mm1 \n\t" 00431 PAVGB" (%2), %%mm0 \n\t" 00432 PAVGB" 8(%2), %%mm1 \n\t" 00433 "add %4, %1 \n\t" 00434 "add $16, %2 \n\t" 00435 PAVGB" (%3), %%mm0 \n\t" 00436 PAVGB" 8(%3), %%mm1 \n\t" 00437 "movq %%mm0, (%3) \n\t" 00438 "movq %%mm1, 8(%3) \n\t" 00439 "add %5, %3 \n\t" 00440 "decl %0 \n\t" 00441 "1: \n\t" 00442 "movq (%1), %%mm0 \n\t" 00443 "movq 8(%1), %%mm1 \n\t" 00444 "add %4, %1 \n\t" 00445 PAVGB" (%2), %%mm0 \n\t" 00446 PAVGB" 8(%2), %%mm1 \n\t" 00447 PAVGB" (%3), %%mm0 \n\t" 00448 PAVGB" 8(%3), %%mm1 \n\t" 00449 "movq %%mm0, (%3) \n\t" 00450 "movq %%mm1, 8(%3) \n\t" 00451 "add %5, %3 \n\t" 00452 "movq (%1), %%mm0 \n\t" 00453 "movq 8(%1), %%mm1 \n\t" 00454 "add %4, %1 \n\t" 00455 PAVGB" 16(%2), %%mm0 \n\t" 00456 PAVGB" 24(%2), %%mm1 \n\t" 00457 PAVGB" (%3), %%mm0 \n\t" 00458 PAVGB" 8(%3), %%mm1 \n\t" 
00459 "movq %%mm0, (%3) \n\t" 00460 "movq %%mm1, 8(%3) \n\t" 00461 "add %5, %3 \n\t" 00462 "add $32, %2 \n\t" 00463 "subl $2, %0 \n\t" 00464 "jnz 1b \n\t" 00465 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00466 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00467 #else 00468 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00469 #endif 00470 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00471 :"memory"); 00472 //the following should be used, though better not with gcc ... 00473 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00474 :"r"(src1Stride), "r"(dstStride) 00475 :"memory");*/ 00476 } 00477 00478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 00479 { 00480 __asm__ volatile( 00481 "pcmpeqb %%mm6, %%mm6 \n\t" 00482 "testl $1, %0 \n\t" 00483 " jz 1f \n\t" 00484 "movq (%1), %%mm0 \n\t" 00485 "movq 8(%1), %%mm1 \n\t" 00486 "movq (%2), %%mm2 \n\t" 00487 "movq 8(%2), %%mm3 \n\t" 00488 "pxor %%mm6, %%mm0 \n\t" 00489 "pxor %%mm6, %%mm1 \n\t" 00490 "pxor %%mm6, %%mm2 \n\t" 00491 "pxor %%mm6, %%mm3 \n\t" 00492 PAVGB" %%mm2, %%mm0 \n\t" 00493 PAVGB" %%mm3, %%mm1 \n\t" 00494 "pxor %%mm6, %%mm0 \n\t" 00495 "pxor %%mm6, %%mm1 \n\t" 00496 "add %4, %1 \n\t" 00497 "add $16, %2 \n\t" 00498 "movq %%mm0, (%3) \n\t" 00499 "movq %%mm1, 8(%3) \n\t" 00500 "add %5, %3 \n\t" 00501 "decl %0 \n\t" 00502 "1: \n\t" 00503 "movq (%1), %%mm0 \n\t" 00504 "movq 8(%1), %%mm1 \n\t" 00505 "add %4, %1 \n\t" 00506 "movq (%2), %%mm2 \n\t" 00507 "movq 8(%2), %%mm3 \n\t" 00508 "pxor %%mm6, %%mm0 \n\t" 00509 "pxor %%mm6, %%mm1 \n\t" 00510 "pxor %%mm6, %%mm2 \n\t" 00511 "pxor %%mm6, %%mm3 \n\t" 00512 PAVGB" %%mm2, %%mm0 \n\t" 00513 PAVGB" %%mm3, %%mm1 \n\t" 00514 "pxor %%mm6, %%mm0 \n\t" 00515 "pxor %%mm6, %%mm1 \n\t" 00516 "movq %%mm0, (%3) \n\t" 00517 "movq %%mm1, 8(%3) \n\t" 00518 "add %5, %3 \n\t" 00519 "movq (%1), %%mm0 \n\t" 00520 "movq 8(%1), %%mm1 \n\t" 00521 "add 
%4, %1 \n\t" 00522 "movq 16(%2), %%mm2 \n\t" 00523 "movq 24(%2), %%mm3 \n\t" 00524 "pxor %%mm6, %%mm0 \n\t" 00525 "pxor %%mm6, %%mm1 \n\t" 00526 "pxor %%mm6, %%mm2 \n\t" 00527 "pxor %%mm6, %%mm3 \n\t" 00528 PAVGB" %%mm2, %%mm0 \n\t" 00529 PAVGB" %%mm3, %%mm1 \n\t" 00530 "pxor %%mm6, %%mm0 \n\t" 00531 "pxor %%mm6, %%mm1 \n\t" 00532 "movq %%mm0, (%3) \n\t" 00533 "movq %%mm1, 8(%3) \n\t" 00534 "add %5, %3 \n\t" 00535 "add $32, %2 \n\t" 00536 "subl $2, %0 \n\t" 00537 "jnz 1b \n\t" 00538 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 00539 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00540 #else 00541 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 00542 #endif 00543 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 00544 :"memory"); 00545 //the following should be used, though better not with gcc ... 00546 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 00547 :"r"(src1Stride), "r"(dstStride) 00548 :"memory");*/ 00549 } 00550 00551 /* GL: this function does incorrect rounding if overflow */ 00552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00553 { 00554 MOVQ_BONE(mm6); 00555 __asm__ volatile( 00556 "lea (%3, %3), %%"REG_a" \n\t" 00557 "1: \n\t" 00558 "movq (%1), %%mm0 \n\t" 00559 "movq (%1, %3), %%mm2 \n\t" 00560 "movq 1(%1), %%mm1 \n\t" 00561 "movq 1(%1, %3), %%mm3 \n\t" 00562 "add %%"REG_a", %1 \n\t" 00563 "psubusb %%mm6, %%mm0 \n\t" 00564 "psubusb %%mm6, %%mm2 \n\t" 00565 PAVGB" %%mm1, %%mm0 \n\t" 00566 PAVGB" %%mm3, %%mm2 \n\t" 00567 "movq %%mm0, (%2) \n\t" 00568 "movq %%mm2, (%2, %3) \n\t" 00569 "movq (%1), %%mm0 \n\t" 00570 "movq 1(%1), %%mm1 \n\t" 00571 "movq (%1, %3), %%mm2 \n\t" 00572 "movq 1(%1, %3), %%mm3 \n\t" 00573 "add %%"REG_a", %2 \n\t" 00574 "add %%"REG_a", %1 \n\t" 00575 "psubusb %%mm6, %%mm0 \n\t" 00576 "psubusb %%mm6, %%mm2 \n\t" 00577 PAVGB" %%mm1, %%mm0 \n\t" 00578 PAVGB" %%mm3, %%mm2 \n\t" 00579 "movq %%mm0, (%2) \n\t" 00580 "movq %%mm2, 
(%2, %3) \n\t" 00581 "add %%"REG_a", %2 \n\t" 00582 "subl $4, %0 \n\t" 00583 "jnz 1b \n\t" 00584 :"+g"(h), "+S"(pixels), "+D"(block) 00585 :"r" ((x86_reg)line_size) 00586 :"%"REG_a, "memory"); 00587 } 00588 00589 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00590 { 00591 __asm__ volatile( 00592 "lea (%3, %3), %%"REG_a" \n\t" 00593 "movq (%1), %%mm0 \n\t" 00594 "sub %3, %2 \n\t" 00595 "1: \n\t" 00596 "movq (%1, %3), %%mm1 \n\t" 00597 "movq (%1, %%"REG_a"), %%mm2 \n\t" 00598 "add %%"REG_a", %1 \n\t" 00599 PAVGB" %%mm1, %%mm0 \n\t" 00600 PAVGB" %%mm2, %%mm1 \n\t" 00601 "movq %%mm0, (%2, %3) \n\t" 00602 "movq %%mm1, (%2, %%"REG_a") \n\t" 00603 "movq (%1, %3), %%mm1 \n\t" 00604 "movq (%1, %%"REG_a"), %%mm0 \n\t" 00605 "add %%"REG_a", %2 \n\t" 00606 "add %%"REG_a", %1 \n\t" 00607 PAVGB" %%mm1, %%mm2 \n\t" 00608 PAVGB" %%mm0, %%mm1 \n\t" 00609 "movq %%mm2, (%2, %3) \n\t" 00610 "movq %%mm1, (%2, %%"REG_a") \n\t" 00611 "add %%"REG_a", %2 \n\t" 00612 "subl $4, %0 \n\t" 00613 "jnz 1b \n\t" 00614 :"+g"(h), "+S"(pixels), "+D" (block) 00615 :"r" ((x86_reg)line_size) 00616 :"%"REG_a, "memory"); 00617 } 00618 00619 /* GL: this function does incorrect rounding if overflow */ 00620 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00621 { 00622 MOVQ_BONE(mm6); 00623 __asm__ volatile( 00624 "lea (%3, %3), %%"REG_a" \n\t" 00625 "movq (%1), %%mm0 \n\t" 00626 "sub %3, %2 \n\t" 00627 "1: \n\t" 00628 "movq (%1, %3), %%mm1 \n\t" 00629 "movq (%1, %%"REG_a"), %%mm2 \n\t" 00630 "add %%"REG_a", %1 \n\t" 00631 "psubusb %%mm6, %%mm1 \n\t" 00632 PAVGB" %%mm1, %%mm0 \n\t" 00633 PAVGB" %%mm2, %%mm1 \n\t" 00634 "movq %%mm0, (%2, %3) \n\t" 00635 "movq %%mm1, (%2, %%"REG_a") \n\t" 00636 "movq (%1, %3), %%mm1 \n\t" 00637 "movq (%1, %%"REG_a"), %%mm0 \n\t" 00638 "add %%"REG_a", %2 \n\t" 00639 "add %%"REG_a", %1 \n\t" 00640 "psubusb %%mm6, %%mm1 \n\t" 00641 PAVGB" %%mm1, %%mm2 \n\t" 00642 PAVGB" %%mm0, %%mm1 
\n\t" 00643 "movq %%mm2, (%2, %3) \n\t" 00644 "movq %%mm1, (%2, %%"REG_a") \n\t" 00645 "add %%"REG_a", %2 \n\t" 00646 "subl $4, %0 \n\t" 00647 "jnz 1b \n\t" 00648 :"+g"(h), "+S"(pixels), "+D" (block) 00649 :"r" ((x86_reg)line_size) 00650 :"%"REG_a, "memory"); 00651 } 00652 00653 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00654 { 00655 __asm__ volatile( 00656 "lea (%3, %3), %%"REG_a" \n\t" 00657 "1: \n\t" 00658 "movq (%2), %%mm0 \n\t" 00659 "movq (%2, %3), %%mm1 \n\t" 00660 PAVGB" (%1), %%mm0 \n\t" 00661 PAVGB" (%1, %3), %%mm1 \n\t" 00662 "movq %%mm0, (%2) \n\t" 00663 "movq %%mm1, (%2, %3) \n\t" 00664 "add %%"REG_a", %1 \n\t" 00665 "add %%"REG_a", %2 \n\t" 00666 "movq (%2), %%mm0 \n\t" 00667 "movq (%2, %3), %%mm1 \n\t" 00668 PAVGB" (%1), %%mm0 \n\t" 00669 PAVGB" (%1, %3), %%mm1 \n\t" 00670 "add %%"REG_a", %1 \n\t" 00671 "movq %%mm0, (%2) \n\t" 00672 "movq %%mm1, (%2, %3) \n\t" 00673 "add %%"REG_a", %2 \n\t" 00674 "subl $4, %0 \n\t" 00675 "jnz 1b \n\t" 00676 :"+g"(h), "+S"(pixels), "+D"(block) 00677 :"r" ((x86_reg)line_size) 00678 :"%"REG_a, "memory"); 00679 } 00680 00681 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00682 { 00683 __asm__ volatile( 00684 "lea (%3, %3), %%"REG_a" \n\t" 00685 "1: \n\t" 00686 "movq (%1), %%mm0 \n\t" 00687 "movq (%1, %3), %%mm2 \n\t" 00688 PAVGB" 1(%1), %%mm0 \n\t" 00689 PAVGB" 1(%1, %3), %%mm2 \n\t" 00690 PAVGB" (%2), %%mm0 \n\t" 00691 PAVGB" (%2, %3), %%mm2 \n\t" 00692 "add %%"REG_a", %1 \n\t" 00693 "movq %%mm0, (%2) \n\t" 00694 "movq %%mm2, (%2, %3) \n\t" 00695 "movq (%1), %%mm0 \n\t" 00696 "movq (%1, %3), %%mm2 \n\t" 00697 PAVGB" 1(%1), %%mm0 \n\t" 00698 PAVGB" 1(%1, %3), %%mm2 \n\t" 00699 "add %%"REG_a", %2 \n\t" 00700 "add %%"REG_a", %1 \n\t" 00701 PAVGB" (%2), %%mm0 \n\t" 00702 PAVGB" (%2, %3), %%mm2 \n\t" 00703 "movq %%mm0, (%2) \n\t" 00704 "movq %%mm2, (%2, %3) \n\t" 00705 "add %%"REG_a", %2 \n\t" 00706 "subl $4, %0 \n\t" 00707 "jnz 1b 
\n\t" 00708 :"+g"(h), "+S"(pixels), "+D"(block) 00709 :"r" ((x86_reg)line_size) 00710 :"%"REG_a, "memory"); 00711 } 00712 00713 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00714 { 00715 __asm__ volatile( 00716 "lea (%3, %3), %%"REG_a" \n\t" 00717 "movq (%1), %%mm0 \n\t" 00718 "sub %3, %2 \n\t" 00719 "1: \n\t" 00720 "movq (%1, %3), %%mm1 \n\t" 00721 "movq (%1, %%"REG_a"), %%mm2 \n\t" 00722 "add %%"REG_a", %1 \n\t" 00723 PAVGB" %%mm1, %%mm0 \n\t" 00724 PAVGB" %%mm2, %%mm1 \n\t" 00725 "movq (%2, %3), %%mm3 \n\t" 00726 "movq (%2, %%"REG_a"), %%mm4 \n\t" 00727 PAVGB" %%mm3, %%mm0 \n\t" 00728 PAVGB" %%mm4, %%mm1 \n\t" 00729 "movq %%mm0, (%2, %3) \n\t" 00730 "movq %%mm1, (%2, %%"REG_a") \n\t" 00731 "movq (%1, %3), %%mm1 \n\t" 00732 "movq (%1, %%"REG_a"), %%mm0 \n\t" 00733 PAVGB" %%mm1, %%mm2 \n\t" 00734 PAVGB" %%mm0, %%mm1 \n\t" 00735 "add %%"REG_a", %2 \n\t" 00736 "add %%"REG_a", %1 \n\t" 00737 "movq (%2, %3), %%mm3 \n\t" 00738 "movq (%2, %%"REG_a"), %%mm4 \n\t" 00739 PAVGB" %%mm3, %%mm2 \n\t" 00740 PAVGB" %%mm4, %%mm1 \n\t" 00741 "movq %%mm2, (%2, %3) \n\t" 00742 "movq %%mm1, (%2, %%"REG_a") \n\t" 00743 "add %%"REG_a", %2 \n\t" 00744 "subl $4, %0 \n\t" 00745 "jnz 1b \n\t" 00746 :"+g"(h), "+S"(pixels), "+D"(block) 00747 :"r" ((x86_reg)line_size) 00748 :"%"REG_a, "memory"); 00749 } 00750 00751 /* Note this is not correctly rounded, but this function is only 00752 * used for B-frames so it does not matter. 
*/ 00753 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00754 { 00755 MOVQ_BONE(mm6); 00756 __asm__ volatile( 00757 "lea (%3, %3), %%"REG_a" \n\t" 00758 "movq (%1), %%mm0 \n\t" 00759 PAVGB" 1(%1), %%mm0 \n\t" 00760 ASMALIGN(3) 00761 "1: \n\t" 00762 "movq (%1, %%"REG_a"), %%mm2 \n\t" 00763 "movq (%1, %3), %%mm1 \n\t" 00764 "psubusb %%mm6, %%mm2 \n\t" 00765 PAVGB" 1(%1, %3), %%mm1 \n\t" 00766 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" 00767 "add %%"REG_a", %1 \n\t" 00768 PAVGB" %%mm1, %%mm0 \n\t" 00769 PAVGB" %%mm2, %%mm1 \n\t" 00770 PAVGB" (%2), %%mm0 \n\t" 00771 PAVGB" (%2, %3), %%mm1 \n\t" 00772 "movq %%mm0, (%2) \n\t" 00773 "movq %%mm1, (%2, %3) \n\t" 00774 "movq (%1, %3), %%mm1 \n\t" 00775 "movq (%1, %%"REG_a"), %%mm0 \n\t" 00776 PAVGB" 1(%1, %3), %%mm1 \n\t" 00777 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" 00778 "add %%"REG_a", %2 \n\t" 00779 "add %%"REG_a", %1 \n\t" 00780 PAVGB" %%mm1, %%mm2 \n\t" 00781 PAVGB" %%mm0, %%mm1 \n\t" 00782 PAVGB" (%2), %%mm2 \n\t" 00783 PAVGB" (%2, %3), %%mm1 \n\t" 00784 "movq %%mm2, (%2) \n\t" 00785 "movq %%mm1, (%2, %3) \n\t" 00786 "add %%"REG_a", %2 \n\t" 00787 "subl $4, %0 \n\t" 00788 "jnz 1b \n\t" 00789 :"+g"(h), "+S"(pixels), "+D"(block) 00790 :"r" ((x86_reg)line_size) 00791 :"%"REG_a, "memory"); 00792 } 00793 00794 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00795 { 00796 do { 00797 __asm__ volatile( 00798 "movd (%1), %%mm0 \n\t" 00799 "movd (%1, %2), %%mm1 \n\t" 00800 "movd (%1, %2, 2), %%mm2 \n\t" 00801 "movd (%1, %3), %%mm3 \n\t" 00802 PAVGB" (%0), %%mm0 \n\t" 00803 PAVGB" (%0, %2), %%mm1 \n\t" 00804 PAVGB" (%0, %2, 2), %%mm2 \n\t" 00805 PAVGB" (%0, %3), %%mm3 \n\t" 00806 "movd %%mm0, (%1) \n\t" 00807 "movd %%mm1, (%1, %2) \n\t" 00808 "movd %%mm2, (%1, %2, 2) \n\t" 00809 "movd %%mm3, (%1, %3) \n\t" 00810 ::"S"(pixels), "D"(block), 00811 "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) 00812 :"memory"); 00813 block += 4*line_size; 00814 
pixels += 4*line_size; 00815 h -= 4; 00816 } while(h > 0); 00817 } 00818 00819 //FIXME the following could be optimized too ... 00820 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00821 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); 00822 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); 00823 } 00824 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00825 DEF(put_pixels8_y2)(block , pixels , line_size, h); 00826 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); 00827 } 00828 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00829 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); 00830 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); 00831 } 00832 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00833 DEF(avg_pixels8)(block , pixels , line_size, h); 00834 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); 00835 } 00836 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00837 DEF(avg_pixels8_x2)(block , pixels , line_size, h); 00838 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); 00839 } 00840 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00841 DEF(avg_pixels8_y2)(block , pixels , line_size, h); 00842 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); 00843 } 00844 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 00845 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); 00846 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); 00847 } 00848 00849 #define QPEL_2TAP_L3(OPNAME) \ 00850 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ 00851 __asm__ volatile(\ 00852 "1: \n\t"\ 00853 "movq (%1,%2), %%mm0 \n\t"\ 00854 
"movq 8(%1,%2), %%mm1 \n\t"\ 00855 PAVGB" (%1,%3), %%mm0 \n\t"\ 00856 PAVGB" 8(%1,%3), %%mm1 \n\t"\ 00857 PAVGB" (%1), %%mm0 \n\t"\ 00858 PAVGB" 8(%1), %%mm1 \n\t"\ 00859 STORE_OP( (%1,%4),%%mm0)\ 00860 STORE_OP(8(%1,%4),%%mm1)\ 00861 "movq %%mm0, (%1,%4) \n\t"\ 00862 "movq %%mm1, 8(%1,%4) \n\t"\ 00863 "add %5, %1 \n\t"\ 00864 "decl %0 \n\t"\ 00865 "jnz 1b \n\t"\ 00866 :"+g"(h), "+r"(src)\ 00867 :"r"((x86_reg)off1), "r"((x86_reg)off2),\ 00868 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ 00869 :"memory"\ 00870 );\ 00871 }\ 00872 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ 00873 __asm__ volatile(\ 00874 "1: \n\t"\ 00875 "movq (%1,%2), %%mm0 \n\t"\ 00876 PAVGB" (%1,%3), %%mm0 \n\t"\ 00877 PAVGB" (%1), %%mm0 \n\t"\ 00878 STORE_OP((%1,%4),%%mm0)\ 00879 "movq %%mm0, (%1,%4) \n\t"\ 00880 "add %5, %1 \n\t"\ 00881 "decl %0 \n\t"\ 00882 "jnz 1b \n\t"\ 00883 :"+g"(h), "+r"(src)\ 00884 :"r"((x86_reg)off1), "r"((x86_reg)off2),\ 00885 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ 00886 :"memory"\ 00887 );\ 00888 } 00889 00890 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" 00891 QPEL_2TAP_L3(avg_) 00892 #undef STORE_OP 00893 #define STORE_OP(a,b) 00894 QPEL_2TAP_L3(put_) 00895 #undef STORE_OP 00896 #undef QPEL_2TAP_L3