Libav
|
/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Reading notes for the put_* kernels below.
 *
 * - DEF(), MOVQ_BFE(), MOVQ_ZERO(), SET_RND(), PAVGB(), PAVGBP(), ASMALIGN()
 *   and REG_a are not defined in this file; presumably they are supplied by
 *   the file that includes this template once per rounding mode -- verify
 *   against the includer.
 * - PAVGB(a, b, r, mask) and PAVGBP(a, b, r,  c, d, s) are used as
 *   "combine a with b into r" (and c with d into s); the stored results are
 *   always taken from r / s.  Presumably this is the rnd / no_rnd byte
 *   average -- TODO confirm against the macro definitions.
 * - The asm loops count h down by a fixed step and leave only when the
 *   counter reaches exactly zero (jnz), so each function documents the row
 *   counts for which the loop terminates cleanly.
 * - None of the asm statements list the MMX registers they modify in the
 *   clobber list.
 */

// put_pixels
/*
 * 8-pixel-wide horizontal half-pel put: for every row, the 8 bytes at
 * pixels[0..7] are combined with the 8 bytes at pixels[1..8] and the result
 * is stored to block.  Source and destination share the stride line_size.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * Four rows are produced per loop pass, so the loop terminates cleanly only
 * when h is a positive multiple of 4.
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        ASMALIGN(3)
        "1: \n\t"
        // rows 0 and 1 of this pass
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // rows 2 and 3 of this pass (same sequence again)
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * 8-pixel-wide put of the combination of two sources.
 *
 * src1 is walked with stride src1Stride; src2 is read as consecutive 8-byte
 * rows (offsets 0, 8, 16, 24, then advanced by 32); dst is walked with
 * stride dstStride.
 *
 * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *           %4 = src1Stride, %5 = dstStride.
 * If the low bit of h is set, one row is handled up front and h is
 * decremented; the main loop is then entered unconditionally and handles
 * four rows per pass.  NOTE(review): this means the count remaining after
 * the optional single row must be a non-zero multiple of 4 -- confirm that
 * all callers respect this.
 */
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        // odd h: emit a single row first
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // rows 0 and 1 of this pass
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        // rows 2 and 3 of this pass
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        "add $32, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        // h lives in memory when EBX is not available, otherwise in EBX
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

/*
 * 16-pixel-wide horizontal half-pel put: same scheme as the 8-wide x2
 * version, applied to the left half (offsets 0/1) and the right half
 * (offsets 8/9) of every row.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * Four rows per loop pass; terminates cleanly only when h is a positive
 * multiple of 4.
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        ASMALIGN(3)
        "1: \n\t"
        // rows 0 and 1, left 8 pixels
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        // rows 0 and 1, right 8 pixels
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // rows 2 and 3, left 8 pixels
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        // rows 2 and 3, right 8 pixels
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * 16-pixel-wide put of the combination of two sources.
 *
 * src1 is walked with stride src1Stride; src2 is read as consecutive
 * 16-byte rows; dst is walked with stride dstStride.
 *
 * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *           %4 = src1Stride, %5 = dstStride.
 * If the low bit of h is set, one row is handled up front and h is
 * decremented; the main loop is then entered unconditionally and handles
 * two rows per pass.  NOTE(review): the count remaining after the optional
 * single row must therefore be non-zero -- confirm against callers.
 */
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        // odd h: emit a single 16-byte row first
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        // row 0 of this pass: src2 bytes 0..15
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        // row 1 of this pass: src2 bytes 16..31
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        // h lives in memory when EBX is not available, otherwise in EBX
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

/*
 * 8-pixel-wide vertical half-pel put: every output row is the combination
 * of a source row and the source row below it.  The first source row is
 * loaded before the loop and each pass loads four more, so h + 1 source
 * rows are read in total.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * Four rows per loop pass; terminates cleanly only when h is a positive
 * multiple of 4.
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        "movq (%1), %%mm0 \n\t"             // mm0 = first source row
        ASMALIGN(3)
        "1: \n\t"
        // mm0 = row n (carried in), mm1 = row n+1, mm2 = row n+2
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        // same step with mm2 as the carried-in row; mm0 receives the row
        // that is carried into the next pass
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * 8-pixel-wide diagonal (x and y) half-pel put.
 *
 * For each source row the bytes at offsets 0..7 and 1..8 are widened to
 * 16-bit words (interleaving with mm7) and added, giving a per-row
 * horizontal sum.  Each output row is
 *     (sum(row n) + sum(row n+1) + mm6) >> 2
 * packed back to bytes with unsigned saturation.  h + 1 source rows are read.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size; REG_a is the
 * running byte offset applied to both pixels (already advanced by one row)
 * and block.
 * Two rows per loop pass; terminates cleanly only when h is a positive
 * even number.
 */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);     // presumably clears mm7 so the punpck* below zero-extend -- TODO confirm
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        // horizontal sum of the first source row -> mm4 (low 4 pixels), mm5 (high 4 pixels)
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"     // REG_a = 0
        "add %3, %1 \n\t"                   // pixels now points at the second source row
        ASMALIGN(3)
        "1: \n\t"
        // horizontal sum of the next source row -> mm0 / mm1
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        // previous-row sum (mm4/mm5) + rounding constant + this-row sum, then >> 2
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "movq %%mm4, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        // second row of the pass: mm0/mm1 now hold the previous-row sum and
        // the new row is summed into mm4/mm5, which carry into the next pass
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        // block is input-only here: rows are addressed through REG_a instead
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

// avg_pixels
/*
 * Reading notes for the avg_* kernels and 16-wide wrappers below.
 *
 * - DEF(), MOVQ_BFE(), MOVQ_ZERO(), SET_RND(), PAVGB(), PAVGBP(), OP_AVG(),
 *   ASMALIGN(), JUMPALIGN() and REG_a are not defined in this file;
 *   presumably they are supplied by the file that includes this template --
 *   verify against the includer.
 * - PAVGB(a, b, r, mask) / PAVGBP(a, b, r,  c, d, s) are used as "combine a
 *   with b into r"; OP_AVG(a, b, r, mask) is used the same way to merge the
 *   freshly computed value with what is already in the destination.
 *   Presumably both are byte averages -- TODO confirm against the macros.
 * - The row-at-a-time functions use do { ... } while (--h), so their body
 *   runs at least once and h is expected to be >= 1.
 * - Offsets such as "8%0" / "1%1" are formed by textually prefixing a
 *   memory operand.  NOTE(review): this relies on the compiler printing the
 *   operand without a displacement of its own -- confirm this is acceptable
 *   for the supported compilers.
 */

/*
 * 4-pixel-wide average into block: each row loads 4 bytes of block and 4
 * bytes of pixels, merges them with OP_AVG and writes the result back to
 * block.  Both pointers advance by line_size per row.
 */
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movd %0, %%mm0 \n\t"           // current destination
            "movd %1, %%mm1 \n\t"           // source
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movd %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

// in case more speed is needed - unrolling would certainly help
/*
 * 8-pixel-wide average into block: same as the 4-wide version but moving
 * 8 bytes per row.
 */
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"           // current destination
            "movq %1, %%mm1 \n\t"           // source
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

/*
 * 16-pixel-wide average into block: the 8-wide step applied to bytes 0..7
 * and then to bytes 8..15 of every row.
 */
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            // left 8 pixels
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            // right 8 pixels
            "movq 8%0, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

/*
 * 8-pixel-wide horizontal half-pel average: per row, pixels[0..7] and
 * pixels[1..8] are combined with PAVGB, and that value is merged with the
 * existing contents of block through OP_AVG.
 */
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"           // current destination
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

/*
 * 8-pixel-wide average of two sources into dst: per row, src1 and src2 are
 * combined with PAVGB and merged with the existing contents of dst through
 * OP_AVG.  dst advances by dstStride, src1 by src1Stride and src2 by 8
 * bytes per row.
 */
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"           // current destination
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 8;
    } while (--h);
}

/*
 * 16-pixel-wide horizontal half-pel average: the 8-wide x2 step applied to
 * the left half (offsets 0/1) and the right half (offsets 8/9) of every row.
 */
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            // left 8 pixels
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            // right 8 pixels
            "movq 8%1, %%mm0 \n\t"
            "movq 9%1, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

/*
 * 16-pixel-wide average of two sources into dst: the 8-wide l2 step applied
 * to bytes 0..7 and 8..15 of every row.  dst advances by dstStride, src1 by
 * src1Stride and src2 by 16 bytes per row.
 */
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            // left 8 pixels
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            // right 8 pixels
            "movq 8%1, %%mm0 \n\t"
            "movq 8%2, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 16;
    } while (--h);
}

/*
 * 8-pixel-wide vertical half-pel average: each source row is combined with
 * the row below it (PAVGBP), and that value is merged with the existing
 * contents of block through OP_AVG.  The first source row is loaded before
 * the loop and each pass loads four more, so h + 1 source rows are read.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * Four rows per loop pass; terminates cleanly only when h is a positive
 * multiple of 4.
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"      // REG_a = 2 * line_size
        "movq (%1), %%mm0 \n\t"             // mm0 = first source row
        ASMALIGN(3)
        "1: \n\t"
        // mm0 = row n (carried in), mm1 = row n+1, mm2 = row n+2
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        // same step with mm2 as the carried-in row; mm0 receives the row
        // that is carried into the next pass
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

// this routine is 'slightly' suboptimal but mostly unused
/*
 * 8-pixel-wide diagonal (x and y) half-pel average.
 *
 * For each source row the bytes at offsets 0..7 and 1..8 are widened to
 * 16-bit words (interleaving with mm7) and added, giving a per-row
 * horizontal sum.  The interpolated row is
 *     (sum(row n) + sum(row n+1) + mm6) >> 2
 * packed back to bytes with unsigned saturation, and is then merged with
 * the existing contents of block through OP_AVG.  h + 1 source rows are read.
 *
 * mm6 holds the SET_RND constant here, so the mask argument of OP_AVG is
 * rebuilt in mm2 before every use: pcmpeqd sets all bits and paddb doubles
 * each byte, leaving 0xFE in every byte.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size; REG_a is the
 * running byte offset applied to both pixels (already advanced by one row)
 * and block.
 * Two rows per loop pass; terminates cleanly only when h is a positive
 * even number.
 */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);     // presumably clears mm7 so the punpck* below zero-extend -- TODO confirm
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        // horizontal sum of the first source row -> mm4 (low 4 pixels), mm5 (high 4 pixels)
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"     // REG_a = 0
        "add %3, %1 \n\t"                   // pixels now points at the second source row
        ASMALIGN(3)
        "1: \n\t"
        // horizontal sum of the next source row -> mm0 / mm1
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        // previous-row sum (mm4/mm5) + rounding constant + this-row sum, then >> 2
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"  // current destination row
        "packuswb %%mm5, %%mm4 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"         // mm2 = all ones
        "paddb %%mm2, %%mm2 \n\t"           // mm2 = 0xFE in every byte
        OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        // second row of the pass: mm0/mm1 now hold the previous-row sum and
        // the new row is summed into mm4/mm5, which carry into the next pass
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"  // current destination row
        "packuswb %%mm1, %%mm0 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"         // mm2 = all ones
        "paddb %%mm2, %%mm2 \n\t"           // mm2 = 0xFE in every byte
        OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        // block is input-only here: rows are addressed through REG_a instead
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

//FIXME optimize
/* 16-wide vertical half-pel put: runs the 8-wide version on the left and right halves. */
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

/* 16-wide diagonal half-pel put: runs the 8-wide version on the left and right halves. */
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

/* 16-wide vertical half-pel average: runs the 8-wide version on the left and right halves. */
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

/* 16-wide diagonal half-pel average: runs the 8-wide version on the left and right halves. */
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}