/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq %%mm2, 16(%1, %%"REG_a") \n\t"
        "movq %%mm3, 24(%1, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add $32, %%"REG_a" \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4, %%xmm4 \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "lea (%0,%2,4), %0 \n\t"
        "punpcklbw %%xmm4, %%xmm0 \n\t"
        "punpcklbw %%xmm4, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm2 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm0, (%1) \n\t"
        "movdqa %%xmm1, 16(%1) \n\t"
        "movdqa %%xmm2, 32(%1) \n\t"
        "movdqa %%xmm3, 48(%1) \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "punpcklbw %%xmm4, %%xmm0 \n\t"
        "punpcklbw %%xmm4, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm2 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm0, 64(%1) \n\t"
        "movdqa %%xmm1, 80(%1) \n\t"
        "movdqa %%xmm2, 96(%1) \n\t"
        "movdqa %%xmm3, 112(%1) \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128, %%"REG_a" \n\t"
00100 ".p2align 4 \n\t" 00101 "1: \n\t" 00102 "movq (%0), %%mm0 \n\t" 00103 "movq (%1), %%mm2 \n\t" 00104 "movq %%mm0, %%mm1 \n\t" 00105 "movq %%mm2, %%mm3 \n\t" 00106 "punpcklbw %%mm7, %%mm0 \n\t" 00107 "punpckhbw %%mm7, %%mm1 \n\t" 00108 "punpcklbw %%mm7, %%mm2 \n\t" 00109 "punpckhbw %%mm7, %%mm3 \n\t" 00110 "psubw %%mm2, %%mm0 \n\t" 00111 "psubw %%mm3, %%mm1 \n\t" 00112 "movq %%mm0, (%2, %%"REG_a") \n\t" 00113 "movq %%mm1, 8(%2, %%"REG_a") \n\t" 00114 "add %3, %0 \n\t" 00115 "add %3, %1 \n\t" 00116 "add $16, %%"REG_a" \n\t" 00117 "jnz 1b \n\t" 00118 : "+r" (s1), "+r" (s2) 00119 : "r" (block+64), "r" ((x86_reg)stride) 00120 : "%"REG_a 00121 ); 00122 } 00123 00124 static int pix_sum16_mmx(uint8_t * pix, int line_size){ 00125 const int h=16; 00126 int sum; 00127 x86_reg index= -line_size*h; 00128 00129 __asm__ volatile( 00130 "pxor %%mm7, %%mm7 \n\t" 00131 "pxor %%mm6, %%mm6 \n\t" 00132 "1: \n\t" 00133 "movq (%2, %1), %%mm0 \n\t" 00134 "movq (%2, %1), %%mm1 \n\t" 00135 "movq 8(%2, %1), %%mm2 \n\t" 00136 "movq 8(%2, %1), %%mm3 \n\t" 00137 "punpcklbw %%mm7, %%mm0 \n\t" 00138 "punpckhbw %%mm7, %%mm1 \n\t" 00139 "punpcklbw %%mm7, %%mm2 \n\t" 00140 "punpckhbw %%mm7, %%mm3 \n\t" 00141 "paddw %%mm0, %%mm1 \n\t" 00142 "paddw %%mm2, %%mm3 \n\t" 00143 "paddw %%mm1, %%mm3 \n\t" 00144 "paddw %%mm3, %%mm6 \n\t" 00145 "add %3, %1 \n\t" 00146 " js 1b \n\t" 00147 "movq %%mm6, %%mm5 \n\t" 00148 "psrlq $32, %%mm6 \n\t" 00149 "paddw %%mm5, %%mm6 \n\t" 00150 "movq %%mm6, %%mm5 \n\t" 00151 "psrlq $16, %%mm6 \n\t" 00152 "paddw %%mm5, %%mm6 \n\t" 00153 "movd %%mm6, %0 \n\t" 00154 "andl $0xFFFF, %0 \n\t" 00155 : "=&r" (sum), "+r" (index) 00156 : "r" (pix - index), "r" ((x86_reg)line_size) 00157 ); 00158 00159 return sum; 00160 } 00161 00162 static int pix_norm1_mmx(uint8_t *pix, int line_size) { 00163 int tmp; 00164 __asm__ volatile ( 00165 "movl $16,%%ecx\n" 00166 "pxor %%mm0,%%mm0\n" 00167 "pxor %%mm7,%%mm7\n" 00168 "1:\n" 00169 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ 00170 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ 00171 00172 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ 00173 00174 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ 00175 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ 00176 00177 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ 00178 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ 00179 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ 00180 00181 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 00182 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 00183 00184 "pmaddwd %%mm3,%%mm3\n" 00185 "pmaddwd %%mm4,%%mm4\n" 00186 00187 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 00188 pix2^2+pix3^2+pix6^2+pix7^2) */ 00189 "paddd %%mm3,%%mm4\n" 00190 "paddd %%mm2,%%mm7\n" 00191 00192 "add %2, %0\n" 00193 "paddd %%mm4,%%mm7\n" 00194 "dec %%ecx\n" 00195 "jnz 1b\n" 00196 00197 "movq %%mm7,%%mm1\n" 00198 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00199 "paddd %%mm7,%%mm1\n" 00200 "movd %%mm1,%1\n" 00201 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 00202 return tmp; 00203 } 00204 00205 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00206 int tmp; 00207 __asm__ volatile ( 00208 "movl %4,%%ecx\n" 00209 "shr $1,%%ecx\n" 00210 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 00211 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 00212 "1:\n" 00213 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ 00214 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ 00215 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ 00216 "movq 
(%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ 00217 00218 /* todo: mm1-mm2, mm3-mm4 */ 00219 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 00220 /* OR the results to get absolute difference */ 00221 "movq %%mm1,%%mm5\n" 00222 "movq %%mm3,%%mm6\n" 00223 "psubusb %%mm2,%%mm1\n" 00224 "psubusb %%mm4,%%mm3\n" 00225 "psubusb %%mm5,%%mm2\n" 00226 "psubusb %%mm6,%%mm4\n" 00227 00228 "por %%mm1,%%mm2\n" 00229 "por %%mm3,%%mm4\n" 00230 00231 /* now convert to 16-bit vectors so we can square them */ 00232 "movq %%mm2,%%mm1\n" 00233 "movq %%mm4,%%mm3\n" 00234 00235 "punpckhbw %%mm0,%%mm2\n" 00236 "punpckhbw %%mm0,%%mm4\n" 00237 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 00238 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 00239 00240 "pmaddwd %%mm2,%%mm2\n" 00241 "pmaddwd %%mm4,%%mm4\n" 00242 "pmaddwd %%mm1,%%mm1\n" 00243 "pmaddwd %%mm3,%%mm3\n" 00244 00245 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 00246 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 00247 00248 "paddd %%mm2,%%mm1\n" 00249 "paddd %%mm4,%%mm3\n" 00250 "paddd %%mm1,%%mm7\n" 00251 "paddd %%mm3,%%mm7\n" 00252 00253 "decl %%ecx\n" 00254 "jnz 1b\n" 00255 00256 "movq %%mm7,%%mm1\n" 00257 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00258 "paddd %%mm7,%%mm1\n" 00259 "movd %%mm1,%2\n" 00260 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00261 : "r" ((x86_reg)line_size) , "m" (h) 00262 : "%ecx"); 00263 return tmp; 00264 } 00265 00266 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00267 int tmp; 00268 __asm__ volatile ( 00269 "movl %4,%%ecx\n" 00270 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 00271 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 00272 "1:\n" 00273 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 00274 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 00275 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 00276 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 00277 00278 /* todo: mm1-mm2, mm3-mm4 */ 00279 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 00280 /* OR the results to get absolute difference */ 00281 "movq %%mm1,%%mm5\n" 00282 "movq %%mm3,%%mm6\n" 00283 "psubusb %%mm2,%%mm1\n" 00284 "psubusb %%mm4,%%mm3\n" 00285 "psubusb %%mm5,%%mm2\n" 00286 "psubusb %%mm6,%%mm4\n" 00287 00288 "por %%mm1,%%mm2\n" 00289 "por %%mm3,%%mm4\n" 00290 00291 /* now convert to 16-bit vectors so we can square them */ 00292 "movq %%mm2,%%mm1\n" 00293 "movq %%mm4,%%mm3\n" 00294 00295 "punpckhbw %%mm0,%%mm2\n" 00296 "punpckhbw %%mm0,%%mm4\n" 00297 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 00298 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 00299 00300 "pmaddwd %%mm2,%%mm2\n" 00301 "pmaddwd %%mm4,%%mm4\n" 00302 "pmaddwd %%mm1,%%mm1\n" 00303 "pmaddwd %%mm3,%%mm3\n" 00304 00305 "add %3,%0\n" 00306 "add %3,%1\n" 00307 00308 "paddd %%mm2,%%mm1\n" 00309 "paddd %%mm4,%%mm3\n" 00310 "paddd %%mm1,%%mm7\n" 00311 "paddd %%mm3,%%mm7\n" 00312 00313 "decl %%ecx\n" 00314 "jnz 1b\n" 00315 00316 "movq %%mm7,%%mm1\n" 00317 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00318 "paddd %%mm7,%%mm1\n" 00319 "movd %%mm1,%2\n" 00320 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00321 : "r" ((x86_reg)line_size) , "m" (h) 00322 : "%ecx"); 00323 return tmp; 00324 } 00325 00326 int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); 00327 00328 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 00329 int tmp; 00330 __asm__ volatile ( 00331 "movl %3,%%ecx\n" 00332 "pxor %%mm7,%%mm7\n" 00333 "pxor %%mm6,%%mm6\n" 00334 00335 "movq 
(%0),%%mm0\n" 00336 "movq %%mm0, %%mm1\n" 00337 "psllq $8, %%mm0\n" 00338 "psrlq $8, %%mm1\n" 00339 "psrlq $8, %%mm0\n" 00340 "movq %%mm0, %%mm2\n" 00341 "movq %%mm1, %%mm3\n" 00342 "punpcklbw %%mm7,%%mm0\n" 00343 "punpcklbw %%mm7,%%mm1\n" 00344 "punpckhbw %%mm7,%%mm2\n" 00345 "punpckhbw %%mm7,%%mm3\n" 00346 "psubw %%mm1, %%mm0\n" 00347 "psubw %%mm3, %%mm2\n" 00348 00349 "add %2,%0\n" 00350 00351 "movq (%0),%%mm4\n" 00352 "movq %%mm4, %%mm1\n" 00353 "psllq $8, %%mm4\n" 00354 "psrlq $8, %%mm1\n" 00355 "psrlq $8, %%mm4\n" 00356 "movq %%mm4, %%mm5\n" 00357 "movq %%mm1, %%mm3\n" 00358 "punpcklbw %%mm7,%%mm4\n" 00359 "punpcklbw %%mm7,%%mm1\n" 00360 "punpckhbw %%mm7,%%mm5\n" 00361 "punpckhbw %%mm7,%%mm3\n" 00362 "psubw %%mm1, %%mm4\n" 00363 "psubw %%mm3, %%mm5\n" 00364 "psubw %%mm4, %%mm0\n" 00365 "psubw %%mm5, %%mm2\n" 00366 "pxor %%mm3, %%mm3\n" 00367 "pxor %%mm1, %%mm1\n" 00368 "pcmpgtw %%mm0, %%mm3\n\t" 00369 "pcmpgtw %%mm2, %%mm1\n\t" 00370 "pxor %%mm3, %%mm0\n" 00371 "pxor %%mm1, %%mm2\n" 00372 "psubw %%mm3, %%mm0\n" 00373 "psubw %%mm1, %%mm2\n" 00374 "paddw %%mm0, %%mm2\n" 00375 "paddw %%mm2, %%mm6\n" 00376 00377 "add %2,%0\n" 00378 "1:\n" 00379 00380 "movq (%0),%%mm0\n" 00381 "movq %%mm0, %%mm1\n" 00382 "psllq $8, %%mm0\n" 00383 "psrlq $8, %%mm1\n" 00384 "psrlq $8, %%mm0\n" 00385 "movq %%mm0, %%mm2\n" 00386 "movq %%mm1, %%mm3\n" 00387 "punpcklbw %%mm7,%%mm0\n" 00388 "punpcklbw %%mm7,%%mm1\n" 00389 "punpckhbw %%mm7,%%mm2\n" 00390 "punpckhbw %%mm7,%%mm3\n" 00391 "psubw %%mm1, %%mm0\n" 00392 "psubw %%mm3, %%mm2\n" 00393 "psubw %%mm0, %%mm4\n" 00394 "psubw %%mm2, %%mm5\n" 00395 "pxor %%mm3, %%mm3\n" 00396 "pxor %%mm1, %%mm1\n" 00397 "pcmpgtw %%mm4, %%mm3\n\t" 00398 "pcmpgtw %%mm5, %%mm1\n\t" 00399 "pxor %%mm3, %%mm4\n" 00400 "pxor %%mm1, %%mm5\n" 00401 "psubw %%mm3, %%mm4\n" 00402 "psubw %%mm1, %%mm5\n" 00403 "paddw %%mm4, %%mm5\n" 00404 "paddw %%mm5, %%mm6\n" 00405 00406 "add %2,%0\n" 00407 00408 "movq (%0),%%mm4\n" 00409 "movq %%mm4, %%mm1\n" 00410 "psllq $8, %%mm4\n" 00411 "psrlq $8, %%mm1\n" 00412 "psrlq $8, %%mm4\n" 00413 "movq %%mm4, %%mm5\n" 00414 "movq %%mm1, %%mm3\n" 00415 "punpcklbw %%mm7,%%mm4\n" 00416 "punpcklbw %%mm7,%%mm1\n" 00417 "punpckhbw %%mm7,%%mm5\n" 00418 "punpckhbw %%mm7,%%mm3\n" 00419 "psubw %%mm1, %%mm4\n" 00420 "psubw %%mm3, %%mm5\n" 00421 "psubw %%mm4, %%mm0\n" 00422 "psubw %%mm5, %%mm2\n" 00423 "pxor %%mm3, %%mm3\n" 00424 "pxor %%mm1, %%mm1\n" 00425 "pcmpgtw %%mm0, %%mm3\n\t" 00426 "pcmpgtw %%mm2, %%mm1\n\t" 00427 "pxor %%mm3, %%mm0\n" 00428 "pxor %%mm1, %%mm2\n" 00429 "psubw %%mm3, %%mm0\n" 00430 "psubw %%mm1, %%mm2\n" 00431 "paddw %%mm0, %%mm2\n" 00432 "paddw %%mm2, %%mm6\n" 00433 00434 "add %2,%0\n" 00435 "subl $2, %%ecx\n" 00436 " jnz 1b\n" 00437 00438 "movq %%mm6, %%mm0\n" 00439 "punpcklwd %%mm7,%%mm0\n" 00440 "punpckhwd %%mm7,%%mm6\n" 00441 "paddd %%mm0, %%mm6\n" 00442 00443 "movq %%mm6,%%mm0\n" 00444 "psrlq $32, %%mm6\n" 00445 "paddd %%mm6,%%mm0\n" 00446 "movd %%mm0,%1\n" 00447 : "+r" (pix1), "=r"(tmp) 00448 : "r" ((x86_reg)line_size) , "g" (h-2) 00449 : "%ecx"); 00450 return tmp; 00451 } 00452 00453 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 00454 int tmp; 00455 uint8_t * pix= pix1; 00456 __asm__ volatile ( 00457 "movl %3,%%ecx\n" 00458 "pxor %%mm7,%%mm7\n" 00459 "pxor %%mm6,%%mm6\n" 00460 00461 "movq (%0),%%mm0\n" 00462 "movq 1(%0),%%mm1\n" 00463 "movq %%mm0, %%mm2\n" 00464 "movq %%mm1, %%mm3\n" 00465 "punpcklbw %%mm7,%%mm0\n" 00466 "punpcklbw %%mm7,%%mm1\n" 00467 "punpckhbw %%mm7,%%mm2\n" 00468 "punpckhbw %%mm7,%%mm3\n" 00469 "psubw 
%%mm1, %%mm0\n" 00470 "psubw %%mm3, %%mm2\n" 00471 00472 "add %2,%0\n" 00473 00474 "movq (%0),%%mm4\n" 00475 "movq 1(%0),%%mm1\n" 00476 "movq %%mm4, %%mm5\n" 00477 "movq %%mm1, %%mm3\n" 00478 "punpcklbw %%mm7,%%mm4\n" 00479 "punpcklbw %%mm7,%%mm1\n" 00480 "punpckhbw %%mm7,%%mm5\n" 00481 "punpckhbw %%mm7,%%mm3\n" 00482 "psubw %%mm1, %%mm4\n" 00483 "psubw %%mm3, %%mm5\n" 00484 "psubw %%mm4, %%mm0\n" 00485 "psubw %%mm5, %%mm2\n" 00486 "pxor %%mm3, %%mm3\n" 00487 "pxor %%mm1, %%mm1\n" 00488 "pcmpgtw %%mm0, %%mm3\n\t" 00489 "pcmpgtw %%mm2, %%mm1\n\t" 00490 "pxor %%mm3, %%mm0\n" 00491 "pxor %%mm1, %%mm2\n" 00492 "psubw %%mm3, %%mm0\n" 00493 "psubw %%mm1, %%mm2\n" 00494 "paddw %%mm0, %%mm2\n" 00495 "paddw %%mm2, %%mm6\n" 00496 00497 "add %2,%0\n" 00498 "1:\n" 00499 00500 "movq (%0),%%mm0\n" 00501 "movq 1(%0),%%mm1\n" 00502 "movq %%mm0, %%mm2\n" 00503 "movq %%mm1, %%mm3\n" 00504 "punpcklbw %%mm7,%%mm0\n" 00505 "punpcklbw %%mm7,%%mm1\n" 00506 "punpckhbw %%mm7,%%mm2\n" 00507 "punpckhbw %%mm7,%%mm3\n" 00508 "psubw %%mm1, %%mm0\n" 00509 "psubw %%mm3, %%mm2\n" 00510 "psubw %%mm0, %%mm4\n" 00511 "psubw %%mm2, %%mm5\n" 00512 "pxor %%mm3, %%mm3\n" 00513 "pxor %%mm1, %%mm1\n" 00514 "pcmpgtw %%mm4, %%mm3\n\t" 00515 "pcmpgtw %%mm5, %%mm1\n\t" 00516 "pxor %%mm3, %%mm4\n" 00517 "pxor %%mm1, %%mm5\n" 00518 "psubw %%mm3, %%mm4\n" 00519 "psubw %%mm1, %%mm5\n" 00520 "paddw %%mm4, %%mm5\n" 00521 "paddw %%mm5, %%mm6\n" 00522 00523 "add %2,%0\n" 00524 00525 "movq (%0),%%mm4\n" 00526 "movq 1(%0),%%mm1\n" 00527 "movq %%mm4, %%mm5\n" 00528 "movq %%mm1, %%mm3\n" 00529 "punpcklbw %%mm7,%%mm4\n" 00530 "punpcklbw %%mm7,%%mm1\n" 00531 "punpckhbw %%mm7,%%mm5\n" 00532 "punpckhbw %%mm7,%%mm3\n" 00533 "psubw %%mm1, %%mm4\n" 00534 "psubw %%mm3, %%mm5\n" 00535 "psubw %%mm4, %%mm0\n" 00536 "psubw %%mm5, %%mm2\n" 00537 "pxor %%mm3, %%mm3\n" 00538 "pxor %%mm1, %%mm1\n" 00539 "pcmpgtw %%mm0, %%mm3\n\t" 00540 "pcmpgtw %%mm2, %%mm1\n\t" 00541 "pxor %%mm3, %%mm0\n" 00542 "pxor %%mm1, %%mm2\n" 00543 "psubw %%mm3, %%mm0\n" 00544 "psubw %%mm1, %%mm2\n" 00545 "paddw %%mm0, %%mm2\n" 00546 "paddw %%mm2, %%mm6\n" 00547 00548 "add %2,%0\n" 00549 "subl $2, %%ecx\n" 00550 " jnz 1b\n" 00551 00552 "movq %%mm6, %%mm0\n" 00553 "punpcklwd %%mm7,%%mm0\n" 00554 "punpckhwd %%mm7,%%mm6\n" 00555 "paddd %%mm0, %%mm6\n" 00556 00557 "movq %%mm6,%%mm0\n" 00558 "psrlq $32, %%mm6\n" 00559 "paddd %%mm6,%%mm0\n" 00560 "movd %%mm0,%1\n" 00561 : "+r" (pix1), "=r"(tmp) 00562 : "r" ((x86_reg)line_size) , "g" (h-2) 00563 : "%ecx"); 00564 return tmp + hf_noise8_mmx(pix+8, line_size, h); 00565 } 00566 00567 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00568 MpegEncContext *c = p; 00569 int score1, score2; 00570 00571 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); 00572 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); 00573 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); 00574 00575 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 00576 else return score1 + FFABS(score2)*8; 00577 } 00578 00579 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00580 MpegEncContext *c = p; 00581 int score1= sse8_mmx(c, pix1, pix2, line_size, h); 00582 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); 00583 00584 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 00585 else return score1 + FFABS(score2)*8; 00586 } 00587 00588 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "add %2,%0\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "add %2,%0\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0),%%mm2\n"\
    "movq (%1)," #out0 "\n"\
    "movq 8(%0),%%mm3\n"\
    "movq 8(%1)," #out1 "\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb " #out0 ", %%mm2\n"\
    "psubb " #out1 ", %%mm3\n"\
    "pxor %%mm7, %%mm2\n"\
    "pxor %%mm7, %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0)," #out0 "\n"\
    "movq (%1),%%mm2\n"\
    "movq 8(%0)," #out1 "\n"\
    "movq 8(%1),%%mm3\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb %%mm2, " #out0 "\n"\
    "psubb %%mm3, " #out1 "\n"\
    "pxor %%mm7, " #out0 "\n"\
    "pxor %%mm7, " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1: \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "movq (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t"
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

#if HAVE_YASM
        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
#endif

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
#if HAVE_YASM
            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
#endif
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
#endif
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}