00001 /* 00002 * MMX optimized DSP utils 00003 * Copyright (c) 2000, 2001 Fabrice Bellard 00004 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 00005 * 00006 * This file is part of FFmpeg. 00007 * 00008 * FFmpeg is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2.1 of the License, or (at your option) any later version. 00012 * 00013 * FFmpeg is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with FFmpeg; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 * 00022 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 00023 */ 00024 00025 #include "libavutil/x86_cpu.h" 00026 #include "libavcodec/dsputil.h" 00027 #include "libavcodec/mpegvideo.h" 00028 #include "libavcodec/mathops.h" 00029 #include "dsputil_mmx.h" 00030 00031 00032 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) 00033 { 00034 __asm__ volatile( 00035 "mov $-128, %%"REG_a" \n\t" 00036 "pxor %%mm7, %%mm7 \n\t" 00037 ASMALIGN(4) 00038 "1: \n\t" 00039 "movq (%0), %%mm0 \n\t" 00040 "movq (%0, %2), %%mm2 \n\t" 00041 "movq %%mm0, %%mm1 \n\t" 00042 "movq %%mm2, %%mm3 \n\t" 00043 "punpcklbw %%mm7, %%mm0 \n\t" 00044 "punpckhbw %%mm7, %%mm1 \n\t" 00045 "punpcklbw %%mm7, %%mm2 \n\t" 00046 "punpckhbw %%mm7, %%mm3 \n\t" 00047 "movq %%mm0, (%1, %%"REG_a") \n\t" 00048 "movq %%mm1, 8(%1, %%"REG_a") \n\t" 00049 "movq %%mm2, 16(%1, %%"REG_a") \n\t" 00050 "movq %%mm3, 24(%1, %%"REG_a") \n\t" 00051 "add %3, %0 \n\t" 00052 "add $32, %%"REG_a" \n\t" 00053 "js 1b \n\t" 00054 : "+r" (pixels) 00055 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2) 00056 : "%"REG_a 00057 ); 00058 } 00059 00060 static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) 00061 { 00062 __asm__ volatile( 00063 "pxor %%xmm7, %%xmm7 \n\t" 00064 "movq (%0), %%xmm0 \n\t" 00065 "movq (%0, %2), %%xmm1 \n\t" 00066 "movq (%0, %2,2), %%xmm2 \n\t" 00067 "movq (%0, %3), %%xmm3 \n\t" 00068 "lea (%0,%2,4), %0 \n\t" 00069 "punpcklbw %%xmm7, %%xmm0 \n\t" 00070 "punpcklbw %%xmm7, %%xmm1 \n\t" 00071 "punpcklbw %%xmm7, %%xmm2 \n\t" 00072 "punpcklbw %%xmm7, %%xmm3 \n\t" 00073 "movdqa %%xmm0, (%1) \n\t" 00074 "movdqa %%xmm1, 16(%1) \n\t" 00075 "movdqa %%xmm2, 32(%1) \n\t" 00076 "movdqa %%xmm3, 48(%1) \n\t" 00077 "movq (%0), %%xmm0 \n\t" 00078 "movq (%0, %2), %%xmm1 \n\t" 00079 "movq (%0, %2,2), %%xmm2 \n\t" 00080 "movq (%0, %3), %%xmm3 \n\t" 00081 "punpcklbw %%xmm7, %%xmm0 \n\t" 00082 "punpcklbw %%xmm7, %%xmm1 \n\t" 00083 "punpcklbw %%xmm7, %%xmm2 \n\t" 00084 "punpcklbw %%xmm7, %%xmm3 \n\t" 00085 "movdqa %%xmm0, 64(%1) \n\t" 00086 "movdqa %%xmm1, 80(%1) \n\t" 00087 "movdqa %%xmm2, 96(%1) \n\t" 00088 "movdqa %%xmm3, 112(%1) \n\t" 00089 : "+r" (pixels) 00090 : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) 00091 ); 00092 } 00093 00094 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) 00095 { 00096 __asm__ volatile( 00097 "pxor %%mm7, %%mm7 \n\t" 00098 "mov $-128, %%"REG_a" \n\t" 00099 ASMALIGN(4) 00100 "1: \n\t" 
00101 "movq (%0), %%mm0 \n\t" 00102 "movq (%1), %%mm2 \n\t" 00103 "movq %%mm0, %%mm1 \n\t" 00104 "movq %%mm2, %%mm3 \n\t" 00105 "punpcklbw %%mm7, %%mm0 \n\t" 00106 "punpckhbw %%mm7, %%mm1 \n\t" 00107 "punpcklbw %%mm7, %%mm2 \n\t" 00108 "punpckhbw %%mm7, %%mm3 \n\t" 00109 "psubw %%mm2, %%mm0 \n\t" 00110 "psubw %%mm3, %%mm1 \n\t" 00111 "movq %%mm0, (%2, %%"REG_a") \n\t" 00112 "movq %%mm1, 8(%2, %%"REG_a") \n\t" 00113 "add %3, %0 \n\t" 00114 "add %3, %1 \n\t" 00115 "add $16, %%"REG_a" \n\t" 00116 "jnz 1b \n\t" 00117 : "+r" (s1), "+r" (s2) 00118 : "r" (block+64), "r" ((x86_reg)stride) 00119 : "%"REG_a 00120 ); 00121 } 00122 00123 static int pix_sum16_mmx(uint8_t * pix, int line_size){ 00124 const int h=16; 00125 int sum; 00126 x86_reg index= -line_size*h; 00127 00128 __asm__ volatile( 00129 "pxor %%mm7, %%mm7 \n\t" 00130 "pxor %%mm6, %%mm6 \n\t" 00131 "1: \n\t" 00132 "movq (%2, %1), %%mm0 \n\t" 00133 "movq (%2, %1), %%mm1 \n\t" 00134 "movq 8(%2, %1), %%mm2 \n\t" 00135 "movq 8(%2, %1), %%mm3 \n\t" 00136 "punpcklbw %%mm7, %%mm0 \n\t" 00137 "punpckhbw %%mm7, %%mm1 \n\t" 00138 "punpcklbw %%mm7, %%mm2 \n\t" 00139 "punpckhbw %%mm7, %%mm3 \n\t" 00140 "paddw %%mm0, %%mm1 \n\t" 00141 "paddw %%mm2, %%mm3 \n\t" 00142 "paddw %%mm1, %%mm3 \n\t" 00143 "paddw %%mm3, %%mm6 \n\t" 00144 "add %3, %1 \n\t" 00145 " js 1b \n\t" 00146 "movq %%mm6, %%mm5 \n\t" 00147 "psrlq $32, %%mm6 \n\t" 00148 "paddw %%mm5, %%mm6 \n\t" 00149 "movq %%mm6, %%mm5 \n\t" 00150 "psrlq $16, %%mm6 \n\t" 00151 "paddw %%mm5, %%mm6 \n\t" 00152 "movd %%mm6, %0 \n\t" 00153 "andl $0xFFFF, %0 \n\t" 00154 : "=&r" (sum), "+r" (index) 00155 : "r" (pix - index), "r" ((x86_reg)line_size) 00156 ); 00157 00158 return sum; 00159 } 00160 00161 static int pix_norm1_mmx(uint8_t *pix, int line_size) { 00162 int tmp; 00163 __asm__ volatile ( 00164 "movl $16,%%ecx\n" 00165 "pxor %%mm0,%%mm0\n" 00166 "pxor %%mm7,%%mm7\n" 00167 "1:\n" 00168 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ 00169 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ 00170 00171 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ 00172 00173 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ 00174 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ 00175 00176 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ 00177 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ 00178 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ 00179 00180 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 00181 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 00182 00183 "pmaddwd %%mm3,%%mm3\n" 00184 "pmaddwd %%mm4,%%mm4\n" 00185 00186 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 00187 pix2^2+pix3^2+pix6^2+pix7^2) */ 00188 "paddd %%mm3,%%mm4\n" 00189 "paddd %%mm2,%%mm7\n" 00190 00191 "add %2, %0\n" 00192 "paddd %%mm4,%%mm7\n" 00193 "dec %%ecx\n" 00194 "jnz 1b\n" 00195 00196 "movq %%mm7,%%mm1\n" 00197 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00198 "paddd %%mm7,%%mm1\n" 00199 "movd %%mm1,%1\n" 00200 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 00201 return tmp; 00202 } 00203 00204 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00205 int tmp; 00206 __asm__ volatile ( 00207 "movl %4,%%ecx\n" 00208 "shr $1,%%ecx\n" 00209 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 00210 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 00211 "1:\n" 00212 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ 00213 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ 00214 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ 00215 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ 00216 00217 
/* todo: mm1-mm2, mm3-mm4 */ 00218 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 00219 /* OR the results to get absolute difference */ 00220 "movq %%mm1,%%mm5\n" 00221 "movq %%mm3,%%mm6\n" 00222 "psubusb %%mm2,%%mm1\n" 00223 "psubusb %%mm4,%%mm3\n" 00224 "psubusb %%mm5,%%mm2\n" 00225 "psubusb %%mm6,%%mm4\n" 00226 00227 "por %%mm1,%%mm2\n" 00228 "por %%mm3,%%mm4\n" 00229 00230 /* now convert to 16-bit vectors so we can square them */ 00231 "movq %%mm2,%%mm1\n" 00232 "movq %%mm4,%%mm3\n" 00233 00234 "punpckhbw %%mm0,%%mm2\n" 00235 "punpckhbw %%mm0,%%mm4\n" 00236 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 00237 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 00238 00239 "pmaddwd %%mm2,%%mm2\n" 00240 "pmaddwd %%mm4,%%mm4\n" 00241 "pmaddwd %%mm1,%%mm1\n" 00242 "pmaddwd %%mm3,%%mm3\n" 00243 00244 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 00245 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 00246 00247 "paddd %%mm2,%%mm1\n" 00248 "paddd %%mm4,%%mm3\n" 00249 "paddd %%mm1,%%mm7\n" 00250 "paddd %%mm3,%%mm7\n" 00251 00252 "decl %%ecx\n" 00253 "jnz 1b\n" 00254 00255 "movq %%mm7,%%mm1\n" 00256 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00257 "paddd %%mm7,%%mm1\n" 00258 "movd %%mm1,%2\n" 00259 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00260 : "r" ((x86_reg)line_size) , "m" (h) 00261 : "%ecx"); 00262 return tmp; 00263 } 00264 00265 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00266 int tmp; 00267 __asm__ volatile ( 00268 "movl %4,%%ecx\n" 00269 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 00270 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 00271 "1:\n" 00272 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 00273 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 00274 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 00275 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 00276 00277 /* todo: mm1-mm2, mm3-mm4 */ 00278 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 00279 /* OR the results to get absolute difference */ 00280 "movq %%mm1,%%mm5\n" 00281 "movq %%mm3,%%mm6\n" 00282 "psubusb %%mm2,%%mm1\n" 00283 "psubusb %%mm4,%%mm3\n" 00284 "psubusb %%mm5,%%mm2\n" 00285 "psubusb %%mm6,%%mm4\n" 00286 00287 "por %%mm1,%%mm2\n" 00288 "por %%mm3,%%mm4\n" 00289 00290 /* now convert to 16-bit vectors so we can square them */ 00291 "movq %%mm2,%%mm1\n" 00292 "movq %%mm4,%%mm3\n" 00293 00294 "punpckhbw %%mm0,%%mm2\n" 00295 "punpckhbw %%mm0,%%mm4\n" 00296 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 00297 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 00298 00299 "pmaddwd %%mm2,%%mm2\n" 00300 "pmaddwd %%mm4,%%mm4\n" 00301 "pmaddwd %%mm1,%%mm1\n" 00302 "pmaddwd %%mm3,%%mm3\n" 00303 00304 "add %3,%0\n" 00305 "add %3,%1\n" 00306 00307 "paddd %%mm2,%%mm1\n" 00308 "paddd %%mm4,%%mm3\n" 00309 "paddd %%mm1,%%mm7\n" 00310 "paddd %%mm3,%%mm7\n" 00311 00312 "decl %%ecx\n" 00313 "jnz 1b\n" 00314 00315 "movq %%mm7,%%mm1\n" 00316 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 00317 "paddd %%mm7,%%mm1\n" 00318 "movd %%mm1,%2\n" 00319 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00320 : "r" ((x86_reg)line_size) , "m" (h) 00321 : "%ecx"); 00322 return tmp; 00323 } 00324 00325 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00326 int tmp; 00327 __asm__ volatile ( 00328 "shr $1,%2\n" 00329 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ 00330 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ 00331 "1:\n" 00332 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ 00333 "movdqu (%1),%%xmm2\n" /* mm2 = 
pix2[0][0-15] */ 00334 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ 00335 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ 00336 00337 /* todo: mm1-mm2, mm3-mm4 */ 00338 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 00339 /* OR the results to get absolute difference */ 00340 "movdqa %%xmm1,%%xmm5\n" 00341 "movdqa %%xmm3,%%xmm6\n" 00342 "psubusb %%xmm2,%%xmm1\n" 00343 "psubusb %%xmm4,%%xmm3\n" 00344 "psubusb %%xmm5,%%xmm2\n" 00345 "psubusb %%xmm6,%%xmm4\n" 00346 00347 "por %%xmm1,%%xmm2\n" 00348 "por %%xmm3,%%xmm4\n" 00349 00350 /* now convert to 16-bit vectors so we can square them */ 00351 "movdqa %%xmm2,%%xmm1\n" 00352 "movdqa %%xmm4,%%xmm3\n" 00353 00354 "punpckhbw %%xmm0,%%xmm2\n" 00355 "punpckhbw %%xmm0,%%xmm4\n" 00356 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ 00357 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ 00358 00359 "pmaddwd %%xmm2,%%xmm2\n" 00360 "pmaddwd %%xmm4,%%xmm4\n" 00361 "pmaddwd %%xmm1,%%xmm1\n" 00362 "pmaddwd %%xmm3,%%xmm3\n" 00363 00364 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 00365 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 00366 00367 "paddd %%xmm2,%%xmm1\n" 00368 "paddd %%xmm4,%%xmm3\n" 00369 "paddd %%xmm1,%%xmm7\n" 00370 "paddd %%xmm3,%%xmm7\n" 00371 00372 "decl %2\n" 00373 "jnz 1b\n" 00374 00375 "movdqa %%xmm7,%%xmm1\n" 00376 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 00377 "paddd %%xmm1,%%xmm7\n" 00378 "movdqa %%xmm7,%%xmm1\n" 00379 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 00380 "paddd %%xmm1,%%xmm7\n" 00381 "movd %%xmm7,%3\n" 00382 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 00383 : "r" ((x86_reg)line_size)); 00384 return tmp; 00385 } 00386 00387 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 00388 int tmp; 00389 __asm__ volatile ( 00390 "movl %3,%%ecx\n" 00391 "pxor %%mm7,%%mm7\n" 00392 "pxor %%mm6,%%mm6\n" 00393 00394 "movq (%0),%%mm0\n" 00395 "movq %%mm0, %%mm1\n" 00396 "psllq $8, %%mm0\n" 00397 "psrlq $8, %%mm1\n" 00398 "psrlq $8, %%mm0\n" 00399 "movq %%mm0, %%mm2\n" 00400 "movq %%mm1, %%mm3\n" 00401 "punpcklbw %%mm7,%%mm0\n" 00402 "punpcklbw %%mm7,%%mm1\n" 00403 "punpckhbw %%mm7,%%mm2\n" 00404 "punpckhbw %%mm7,%%mm3\n" 00405 "psubw %%mm1, %%mm0\n" 00406 "psubw %%mm3, %%mm2\n" 00407 00408 "add %2,%0\n" 00409 00410 "movq (%0),%%mm4\n" 00411 "movq %%mm4, %%mm1\n" 00412 "psllq $8, %%mm4\n" 00413 "psrlq $8, %%mm1\n" 00414 "psrlq $8, %%mm4\n" 00415 "movq %%mm4, %%mm5\n" 00416 "movq %%mm1, %%mm3\n" 00417 "punpcklbw %%mm7,%%mm4\n" 00418 "punpcklbw %%mm7,%%mm1\n" 00419 "punpckhbw %%mm7,%%mm5\n" 00420 "punpckhbw %%mm7,%%mm3\n" 00421 "psubw %%mm1, %%mm4\n" 00422 "psubw %%mm3, %%mm5\n" 00423 "psubw %%mm4, %%mm0\n" 00424 "psubw %%mm5, %%mm2\n" 00425 "pxor %%mm3, %%mm3\n" 00426 "pxor %%mm1, %%mm1\n" 00427 "pcmpgtw %%mm0, %%mm3\n\t" 00428 "pcmpgtw %%mm2, %%mm1\n\t" 00429 "pxor %%mm3, %%mm0\n" 00430 "pxor %%mm1, %%mm2\n" 00431 "psubw %%mm3, %%mm0\n" 00432 "psubw %%mm1, %%mm2\n" 00433 "paddw %%mm0, %%mm2\n" 00434 "paddw %%mm2, %%mm6\n" 00435 00436 "add %2,%0\n" 00437 "1:\n" 00438 00439 "movq (%0),%%mm0\n" 00440 "movq %%mm0, %%mm1\n" 00441 "psllq $8, %%mm0\n" 00442 "psrlq $8, %%mm1\n" 00443 "psrlq $8, %%mm0\n" 00444 "movq %%mm0, %%mm2\n" 00445 "movq %%mm1, %%mm3\n" 00446 "punpcklbw %%mm7,%%mm0\n" 00447 "punpcklbw %%mm7,%%mm1\n" 00448 "punpckhbw %%mm7,%%mm2\n" 00449 "punpckhbw %%mm7,%%mm3\n" 00450 "psubw %%mm1, %%mm0\n" 00451 "psubw %%mm3, %%mm2\n" 00452 "psubw %%mm0, %%mm4\n" 00453 "psubw %%mm2, %%mm5\n" 00454 "pxor %%mm3, %%mm3\n" 00455 "pxor %%mm1, %%mm1\n" 
00456 "pcmpgtw %%mm4, %%mm3\n\t" 00457 "pcmpgtw %%mm5, %%mm1\n\t" 00458 "pxor %%mm3, %%mm4\n" 00459 "pxor %%mm1, %%mm5\n" 00460 "psubw %%mm3, %%mm4\n" 00461 "psubw %%mm1, %%mm5\n" 00462 "paddw %%mm4, %%mm5\n" 00463 "paddw %%mm5, %%mm6\n" 00464 00465 "add %2,%0\n" 00466 00467 "movq (%0),%%mm4\n" 00468 "movq %%mm4, %%mm1\n" 00469 "psllq $8, %%mm4\n" 00470 "psrlq $8, %%mm1\n" 00471 "psrlq $8, %%mm4\n" 00472 "movq %%mm4, %%mm5\n" 00473 "movq %%mm1, %%mm3\n" 00474 "punpcklbw %%mm7,%%mm4\n" 00475 "punpcklbw %%mm7,%%mm1\n" 00476 "punpckhbw %%mm7,%%mm5\n" 00477 "punpckhbw %%mm7,%%mm3\n" 00478 "psubw %%mm1, %%mm4\n" 00479 "psubw %%mm3, %%mm5\n" 00480 "psubw %%mm4, %%mm0\n" 00481 "psubw %%mm5, %%mm2\n" 00482 "pxor %%mm3, %%mm3\n" 00483 "pxor %%mm1, %%mm1\n" 00484 "pcmpgtw %%mm0, %%mm3\n\t" 00485 "pcmpgtw %%mm2, %%mm1\n\t" 00486 "pxor %%mm3, %%mm0\n" 00487 "pxor %%mm1, %%mm2\n" 00488 "psubw %%mm3, %%mm0\n" 00489 "psubw %%mm1, %%mm2\n" 00490 "paddw %%mm0, %%mm2\n" 00491 "paddw %%mm2, %%mm6\n" 00492 00493 "add %2,%0\n" 00494 "subl $2, %%ecx\n" 00495 " jnz 1b\n" 00496 00497 "movq %%mm6, %%mm0\n" 00498 "punpcklwd %%mm7,%%mm0\n" 00499 "punpckhwd %%mm7,%%mm6\n" 00500 "paddd %%mm0, %%mm6\n" 00501 00502 "movq %%mm6,%%mm0\n" 00503 "psrlq $32, %%mm6\n" 00504 "paddd %%mm6,%%mm0\n" 00505 "movd %%mm0,%1\n" 00506 : "+r" (pix1), "=r"(tmp) 00507 : "r" ((x86_reg)line_size) , "g" (h-2) 00508 : "%ecx"); 00509 return tmp; 00510 } 00511 00512 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 00513 int tmp; 00514 uint8_t * pix= pix1; 00515 __asm__ volatile ( 00516 "movl %3,%%ecx\n" 00517 "pxor %%mm7,%%mm7\n" 00518 "pxor %%mm6,%%mm6\n" 00519 00520 "movq (%0),%%mm0\n" 00521 "movq 1(%0),%%mm1\n" 00522 "movq %%mm0, %%mm2\n" 00523 "movq %%mm1, %%mm3\n" 00524 "punpcklbw %%mm7,%%mm0\n" 00525 "punpcklbw %%mm7,%%mm1\n" 00526 "punpckhbw %%mm7,%%mm2\n" 00527 "punpckhbw %%mm7,%%mm3\n" 00528 "psubw %%mm1, %%mm0\n" 00529 "psubw %%mm3, %%mm2\n" 00530 00531 "add %2,%0\n" 00532 00533 "movq (%0),%%mm4\n" 00534 "movq 1(%0),%%mm1\n" 00535 "movq %%mm4, %%mm5\n" 00536 "movq %%mm1, %%mm3\n" 00537 "punpcklbw %%mm7,%%mm4\n" 00538 "punpcklbw %%mm7,%%mm1\n" 00539 "punpckhbw %%mm7,%%mm5\n" 00540 "punpckhbw %%mm7,%%mm3\n" 00541 "psubw %%mm1, %%mm4\n" 00542 "psubw %%mm3, %%mm5\n" 00543 "psubw %%mm4, %%mm0\n" 00544 "psubw %%mm5, %%mm2\n" 00545 "pxor %%mm3, %%mm3\n" 00546 "pxor %%mm1, %%mm1\n" 00547 "pcmpgtw %%mm0, %%mm3\n\t" 00548 "pcmpgtw %%mm2, %%mm1\n\t" 00549 "pxor %%mm3, %%mm0\n" 00550 "pxor %%mm1, %%mm2\n" 00551 "psubw %%mm3, %%mm0\n" 00552 "psubw %%mm1, %%mm2\n" 00553 "paddw %%mm0, %%mm2\n" 00554 "paddw %%mm2, %%mm6\n" 00555 00556 "add %2,%0\n" 00557 "1:\n" 00558 00559 "movq (%0),%%mm0\n" 00560 "movq 1(%0),%%mm1\n" 00561 "movq %%mm0, %%mm2\n" 00562 "movq %%mm1, %%mm3\n" 00563 "punpcklbw %%mm7,%%mm0\n" 00564 "punpcklbw %%mm7,%%mm1\n" 00565 "punpckhbw %%mm7,%%mm2\n" 00566 "punpckhbw %%mm7,%%mm3\n" 00567 "psubw %%mm1, %%mm0\n" 00568 "psubw %%mm3, %%mm2\n" 00569 "psubw %%mm0, %%mm4\n" 00570 "psubw %%mm2, %%mm5\n" 00571 "pxor %%mm3, %%mm3\n" 00572 "pxor %%mm1, %%mm1\n" 00573 "pcmpgtw %%mm4, %%mm3\n\t" 00574 "pcmpgtw %%mm5, %%mm1\n\t" 00575 "pxor %%mm3, %%mm4\n" 00576 "pxor %%mm1, %%mm5\n" 00577 "psubw %%mm3, %%mm4\n" 00578 "psubw %%mm1, %%mm5\n" 00579 "paddw %%mm4, %%mm5\n" 00580 "paddw %%mm5, %%mm6\n" 00581 00582 "add %2,%0\n" 00583 00584 "movq (%0),%%mm4\n" 00585 "movq 1(%0),%%mm1\n" 00586 "movq %%mm4, %%mm5\n" 00587 "movq %%mm1, %%mm3\n" 00588 "punpcklbw %%mm7,%%mm4\n" 00589 "punpcklbw %%mm7,%%mm1\n" 00590 "punpckhbw %%mm7,%%mm5\n" 
00591 "punpckhbw %%mm7,%%mm3\n" 00592 "psubw %%mm1, %%mm4\n" 00593 "psubw %%mm3, %%mm5\n" 00594 "psubw %%mm4, %%mm0\n" 00595 "psubw %%mm5, %%mm2\n" 00596 "pxor %%mm3, %%mm3\n" 00597 "pxor %%mm1, %%mm1\n" 00598 "pcmpgtw %%mm0, %%mm3\n\t" 00599 "pcmpgtw %%mm2, %%mm1\n\t" 00600 "pxor %%mm3, %%mm0\n" 00601 "pxor %%mm1, %%mm2\n" 00602 "psubw %%mm3, %%mm0\n" 00603 "psubw %%mm1, %%mm2\n" 00604 "paddw %%mm0, %%mm2\n" 00605 "paddw %%mm2, %%mm6\n" 00606 00607 "add %2,%0\n" 00608 "subl $2, %%ecx\n" 00609 " jnz 1b\n" 00610 00611 "movq %%mm6, %%mm0\n" 00612 "punpcklwd %%mm7,%%mm0\n" 00613 "punpckhwd %%mm7,%%mm6\n" 00614 "paddd %%mm0, %%mm6\n" 00615 00616 "movq %%mm6,%%mm0\n" 00617 "psrlq $32, %%mm6\n" 00618 "paddd %%mm6,%%mm0\n" 00619 "movd %%mm0,%1\n" 00620 : "+r" (pix1), "=r"(tmp) 00621 : "r" ((x86_reg)line_size) , "g" (h-2) 00622 : "%ecx"); 00623 return tmp + hf_noise8_mmx(pix+8, line_size, h); 00624 } 00625 00626 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00627 MpegEncContext *c = p; 00628 int score1, score2; 00629 00630 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); 00631 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); 00632 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); 00633 00634 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 00635 else return score1 + FFABS(score2)*8; 00636 } 00637 00638 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00639 MpegEncContext *c = p; 00640 int score1= sse8_mmx(c, pix1, pix2, line_size, h); 00641 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); 00642 00643 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 00644 else return score1 + FFABS(score2)*8; 00645 } 00646 00647 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 00648 int tmp; 00649 00650 assert( (((int)pix) & 7) == 0); 00651 assert((line_size &7) ==0); 00652 00653 #define SUM(in0, in1, out0, out1) \ 00654 "movq (%0), %%mm2\n"\ 00655 "movq 8(%0), %%mm3\n"\ 00656 "add %2,%0\n"\ 00657 "movq %%mm2, " #out0 "\n"\ 00658 "movq %%mm3, " #out1 "\n"\ 00659 "psubusb " #in0 ", %%mm2\n"\ 00660 "psubusb " #in1 ", %%mm3\n"\ 00661 "psubusb " #out0 ", " #in0 "\n"\ 00662 "psubusb " #out1 ", " #in1 "\n"\ 00663 "por %%mm2, " #in0 "\n"\ 00664 "por %%mm3, " #in1 "\n"\ 00665 "movq " #in0 ", %%mm2\n"\ 00666 "movq " #in1 ", %%mm3\n"\ 00667 "punpcklbw %%mm7, " #in0 "\n"\ 00668 "punpcklbw %%mm7, " #in1 "\n"\ 00669 "punpckhbw %%mm7, %%mm2\n"\ 00670 "punpckhbw %%mm7, %%mm3\n"\ 00671 "paddw " #in1 ", " #in0 "\n"\ 00672 "paddw %%mm3, %%mm2\n"\ 00673 "paddw %%mm2, " #in0 "\n"\ 00674 "paddw " #in0 ", %%mm6\n" 00675 00676 00677 __asm__ volatile ( 00678 "movl %3,%%ecx\n" 00679 "pxor %%mm6,%%mm6\n" 00680 "pxor %%mm7,%%mm7\n" 00681 "movq (%0),%%mm0\n" 00682 "movq 8(%0),%%mm1\n" 00683 "add %2,%0\n" 00684 "jmp 2f\n" 00685 "1:\n" 00686 00687 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 00688 "2:\n" 00689 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 00690 00691 "subl $2, %%ecx\n" 00692 "jnz 1b\n" 00693 00694 "movq %%mm6,%%mm0\n" 00695 "psrlq $32, %%mm6\n" 00696 "paddw %%mm6,%%mm0\n" 00697 "movq %%mm0,%%mm6\n" 00698 "psrlq $16, %%mm0\n" 00699 "paddw %%mm6,%%mm0\n" 00700 "movd %%mm0,%1\n" 00701 : "+r" (pix), "=r"(tmp) 00702 : "r" ((x86_reg)line_size) , "m" (h) 00703 : "%ecx"); 00704 return tmp & 0xFFFF; 00705 } 00706 #undef SUM 00707 00708 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 00709 int tmp; 
00710 00711 assert( (((int)pix) & 7) == 0); 00712 assert((line_size &7) ==0); 00713 00714 #define SUM(in0, in1, out0, out1) \ 00715 "movq (%0), " #out0 "\n"\ 00716 "movq 8(%0), " #out1 "\n"\ 00717 "add %2,%0\n"\ 00718 "psadbw " #out0 ", " #in0 "\n"\ 00719 "psadbw " #out1 ", " #in1 "\n"\ 00720 "paddw " #in1 ", " #in0 "\n"\ 00721 "paddw " #in0 ", %%mm6\n" 00722 00723 __asm__ volatile ( 00724 "movl %3,%%ecx\n" 00725 "pxor %%mm6,%%mm6\n" 00726 "pxor %%mm7,%%mm7\n" 00727 "movq (%0),%%mm0\n" 00728 "movq 8(%0),%%mm1\n" 00729 "add %2,%0\n" 00730 "jmp 2f\n" 00731 "1:\n" 00732 00733 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 00734 "2:\n" 00735 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 00736 00737 "subl $2, %%ecx\n" 00738 "jnz 1b\n" 00739 00740 "movd %%mm6,%1\n" 00741 : "+r" (pix), "=r"(tmp) 00742 : "r" ((x86_reg)line_size) , "m" (h) 00743 : "%ecx"); 00744 return tmp; 00745 } 00746 #undef SUM 00747 00748 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00749 int tmp; 00750 00751 assert( (((int)pix1) & 7) == 0); 00752 assert( (((int)pix2) & 7) == 0); 00753 assert((line_size &7) ==0); 00754 00755 #define SUM(in0, in1, out0, out1) \ 00756 "movq (%0),%%mm2\n"\ 00757 "movq (%1)," #out0 "\n"\ 00758 "movq 8(%0),%%mm3\n"\ 00759 "movq 8(%1)," #out1 "\n"\ 00760 "add %3,%0\n"\ 00761 "add %3,%1\n"\ 00762 "psubb " #out0 ", %%mm2\n"\ 00763 "psubb " #out1 ", %%mm3\n"\ 00764 "pxor %%mm7, %%mm2\n"\ 00765 "pxor %%mm7, %%mm3\n"\ 00766 "movq %%mm2, " #out0 "\n"\ 00767 "movq %%mm3, " #out1 "\n"\ 00768 "psubusb " #in0 ", %%mm2\n"\ 00769 "psubusb " #in1 ", %%mm3\n"\ 00770 "psubusb " #out0 ", " #in0 "\n"\ 00771 "psubusb " #out1 ", " #in1 "\n"\ 00772 "por %%mm2, " #in0 "\n"\ 00773 "por %%mm3, " #in1 "\n"\ 00774 "movq " #in0 ", %%mm2\n"\ 00775 "movq " #in1 ", %%mm3\n"\ 00776 "punpcklbw %%mm7, " #in0 "\n"\ 00777 "punpcklbw %%mm7, " #in1 "\n"\ 00778 "punpckhbw %%mm7, %%mm2\n"\ 00779 "punpckhbw %%mm7, %%mm3\n"\ 00780 "paddw " #in1 ", " #in0 "\n"\ 00781 "paddw %%mm3, %%mm2\n"\ 00782 "paddw %%mm2, " #in0 "\n"\ 00783 "paddw " #in0 ", %%mm6\n" 00784 00785 00786 __asm__ volatile ( 00787 "movl %4,%%ecx\n" 00788 "pxor %%mm6,%%mm6\n" 00789 "pcmpeqw %%mm7,%%mm7\n" 00790 "psllw $15, %%mm7\n" 00791 "packsswb %%mm7, %%mm7\n" 00792 "movq (%0),%%mm0\n" 00793 "movq (%1),%%mm2\n" 00794 "movq 8(%0),%%mm1\n" 00795 "movq 8(%1),%%mm3\n" 00796 "add %3,%0\n" 00797 "add %3,%1\n" 00798 "psubb %%mm2, %%mm0\n" 00799 "psubb %%mm3, %%mm1\n" 00800 "pxor %%mm7, %%mm0\n" 00801 "pxor %%mm7, %%mm1\n" 00802 "jmp 2f\n" 00803 "1:\n" 00804 00805 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 00806 "2:\n" 00807 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 00808 00809 "subl $2, %%ecx\n" 00810 "jnz 1b\n" 00811 00812 "movq %%mm6,%%mm0\n" 00813 "psrlq $32, %%mm6\n" 00814 "paddw %%mm6,%%mm0\n" 00815 "movq %%mm0,%%mm6\n" 00816 "psrlq $16, %%mm0\n" 00817 "paddw %%mm6,%%mm0\n" 00818 "movd %%mm0,%2\n" 00819 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00820 : "r" ((x86_reg)line_size) , "m" (h) 00821 : "%ecx"); 00822 return tmp & 0x7FFF; 00823 } 00824 #undef SUM 00825 00826 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 00827 int tmp; 00828 00829 assert( (((int)pix1) & 7) == 0); 00830 assert( (((int)pix2) & 7) == 0); 00831 assert((line_size &7) ==0); 00832 00833 #define SUM(in0, in1, out0, out1) \ 00834 "movq (%0)," #out0 "\n"\ 00835 "movq (%1),%%mm2\n"\ 00836 "movq 8(%0)," #out1 "\n"\ 00837 "movq 8(%1),%%mm3\n"\ 00838 "add %3,%0\n"\ 00839 "add %3,%1\n"\ 00840 "psubb %%mm2, " #out0 "\n"\ 00841 "psubb %%mm3, " #out1 "\n"\ 00842 "pxor %%mm7, " #out0 "\n"\ 
00843 "pxor %%mm7, " #out1 "\n"\ 00844 "psadbw " #out0 ", " #in0 "\n"\ 00845 "psadbw " #out1 ", " #in1 "\n"\ 00846 "paddw " #in1 ", " #in0 "\n"\ 00847 "paddw " #in0 ", %%mm6\n" 00848 00849 __asm__ volatile ( 00850 "movl %4,%%ecx\n" 00851 "pxor %%mm6,%%mm6\n" 00852 "pcmpeqw %%mm7,%%mm7\n" 00853 "psllw $15, %%mm7\n" 00854 "packsswb %%mm7, %%mm7\n" 00855 "movq (%0),%%mm0\n" 00856 "movq (%1),%%mm2\n" 00857 "movq 8(%0),%%mm1\n" 00858 "movq 8(%1),%%mm3\n" 00859 "add %3,%0\n" 00860 "add %3,%1\n" 00861 "psubb %%mm2, %%mm0\n" 00862 "psubb %%mm3, %%mm1\n" 00863 "pxor %%mm7, %%mm0\n" 00864 "pxor %%mm7, %%mm1\n" 00865 "jmp 2f\n" 00866 "1:\n" 00867 00868 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 00869 "2:\n" 00870 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 00871 00872 "subl $2, %%ecx\n" 00873 "jnz 1b\n" 00874 00875 "movd %%mm6,%2\n" 00876 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 00877 : "r" ((x86_reg)line_size) , "m" (h) 00878 : "%ecx"); 00879 return tmp; 00880 } 00881 #undef SUM 00882 00883 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 00884 x86_reg i=0; 00885 __asm__ volatile( 00886 "1: \n\t" 00887 "movq (%2, %0), %%mm0 \n\t" 00888 "movq (%1, %0), %%mm1 \n\t" 00889 "psubb %%mm0, %%mm1 \n\t" 00890 "movq %%mm1, (%3, %0) \n\t" 00891 "movq 8(%2, %0), %%mm0 \n\t" 00892 "movq 8(%1, %0), %%mm1 \n\t" 00893 "psubb %%mm0, %%mm1 \n\t" 00894 "movq %%mm1, 8(%3, %0) \n\t" 00895 "add $16, %0 \n\t" 00896 "cmp %4, %0 \n\t" 00897 " jb 1b \n\t" 00898 : "+r" (i) 00899 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) 00900 ); 00901 for(; i<w; i++) 00902 dst[i+0] = src1[i+0]-src2[i+0]; 00903 } 00904 00905 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){ 00906 x86_reg i=0; 00907 uint8_t l, lt; 00908 00909 __asm__ volatile( 00910 "1: \n\t" 00911 "movq -1(%1, %0), %%mm0 \n\t" // LT 00912 "movq (%1, %0), %%mm1 \n\t" // T 00913 "movq -1(%2, %0), %%mm2 \n\t" // L 00914 "movq (%2, %0), %%mm3 \n\t" // X 00915 "movq %%mm2, %%mm4 \n\t" // L 00916 "psubb %%mm0, %%mm2 \n\t" 00917 "paddb %%mm1, %%mm2 \n\t" // L + T - LT 00918 "movq %%mm4, %%mm5 \n\t" // L 00919 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 00920 "pminub %%mm5, %%mm1 \n\t" // min(T, L) 00921 "pminub %%mm2, %%mm4 \n\t" 00922 "pmaxub %%mm1, %%mm4 \n\t" 00923 "psubb %%mm4, %%mm3 \n\t" // dst - pred 00924 "movq %%mm3, (%3, %0) \n\t" 00925 "add $8, %0 \n\t" 00926 "cmp %4, %0 \n\t" 00927 " jb 1b \n\t" 00928 : "+r" (i) 00929 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) 00930 ); 00931 00932 l= *left; 00933 lt= *left_top; 00934 00935 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); 00936 00937 *left_top= src1[w-1]; 00938 *left = src2[w-1]; 00939 } 00940 00941 #define DIFF_PIXELS_1(m,a,t,p1,p2)\ 00942 "mov"#m" "#p1", "#a" \n\t"\ 00943 "mov"#m" "#p2", "#t" \n\t"\ 00944 "punpcklbw "#a", "#t" \n\t"\ 00945 "punpcklbw "#a", "#a" \n\t"\ 00946 "psubw "#t", "#a" \n\t"\ 00947 00948 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ 00949 uint8_t *p1b=p1, *p2b=p2;\ 00950 __asm__ volatile(\ 00951 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ 00952 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ 00953 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ 00954 "add %4, %1 \n\t"\ 00955 "add %4, %2 \n\t"\ 00956 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ 00957 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ 00958 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ 00959 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ 00960 "mov"#m1" "#mm"0, %0 \n\t"\ 00961 
DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ 00962 "mov"#m1" %0, "#mm"0 \n\t"\ 00963 : "+m"(temp), "+r"(p1b), "+r"(p2b)\ 00964 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\ 00965 );\ 00966 } 00967 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) 00968 00969 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) 00970 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) 00971 00972 #define LBUTTERFLY2(a1,b1,a2,b2)\ 00973 "paddw " #b1 ", " #a1 " \n\t"\ 00974 "paddw " #b2 ", " #a2 " \n\t"\ 00975 "paddw " #b1 ", " #b1 " \n\t"\ 00976 "paddw " #b2 ", " #b2 " \n\t"\ 00977 "psubw " #a1 ", " #b1 " \n\t"\ 00978 "psubw " #a2 ", " #b2 " \n\t" 00979 00980 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ 00981 LBUTTERFLY2(m0, m1, m2, m3)\ 00982 LBUTTERFLY2(m4, m5, m6, m7)\ 00983 LBUTTERFLY2(m0, m2, m1, m3)\ 00984 LBUTTERFLY2(m4, m6, m5, m7)\ 00985 LBUTTERFLY2(m0, m4, m1, m5)\ 00986 LBUTTERFLY2(m2, m6, m3, m7)\ 00987 00988 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) 00989 00990 #define MMABS_MMX(a,z)\ 00991 "pxor " #z ", " #z " \n\t"\ 00992 "pcmpgtw " #a ", " #z " \n\t"\ 00993 "pxor " #z ", " #a " \n\t"\ 00994 "psubw " #z ", " #a " \n\t" 00995 00996 #define MMABS_MMX2(a,z)\ 00997 "pxor " #z ", " #z " \n\t"\ 00998 "psubw " #a ", " #z " \n\t"\ 00999 "pmaxsw " #z ", " #a " \n\t" 01000 01001 #define MMABS_SSSE3(a,z)\ 01002 "pabsw " #a ", " #a " \n\t" 01003 01004 #define MMABS_SUM(a,z, sum)\ 01005 MMABS(a,z)\ 01006 "paddusw " #a ", " #sum " \n\t" 01007 01008 #define MMABS_SUM_8x8_NOSPILL\ 01009 MMABS(%%xmm0, %%xmm8)\ 01010 MMABS(%%xmm1, %%xmm9)\ 01011 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ 01012 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ 01013 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ 01014 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ 01015 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ 01016 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ 01017 "paddusw %%xmm1, %%xmm0 \n\t" 01018 01019 #if ARCH_X86_64 01020 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL 01021 #else 01022 #define MMABS_SUM_8x8_SSE2\ 01023 "movdqa %%xmm7, (%1) \n\t"\ 01024 MMABS(%%xmm0, %%xmm7)\ 01025 MMABS(%%xmm1, %%xmm7)\ 01026 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ 01027 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ 01028 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ 01029 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ 01030 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ 01031 "movdqa (%1), %%xmm2 \n\t"\ 01032 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ 01033 "paddusw %%xmm1, %%xmm0 \n\t" 01034 #endif 01035 01036 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to 01037 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, 01038 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
*/ 01039 #define HSUM_MMX(a, t, dst)\ 01040 "movq "#a", "#t" \n\t"\ 01041 "psrlq $32, "#a" \n\t"\ 01042 "paddusw "#t", "#a" \n\t"\ 01043 "movq "#a", "#t" \n\t"\ 01044 "psrlq $16, "#a" \n\t"\ 01045 "paddusw "#t", "#a" \n\t"\ 01046 "movd "#a", "#dst" \n\t"\ 01047 01048 #define HSUM_MMX2(a, t, dst)\ 01049 "pshufw $0x0E, "#a", "#t" \n\t"\ 01050 "paddusw "#t", "#a" \n\t"\ 01051 "pshufw $0x01, "#a", "#t" \n\t"\ 01052 "paddusw "#t", "#a" \n\t"\ 01053 "movd "#a", "#dst" \n\t"\ 01054 01055 #define HSUM_SSE2(a, t, dst)\ 01056 "movhlps "#a", "#t" \n\t"\ 01057 "paddusw "#t", "#a" \n\t"\ 01058 "pshuflw $0x0E, "#a", "#t" \n\t"\ 01059 "paddusw "#t", "#a" \n\t"\ 01060 "pshuflw $0x01, "#a", "#t" \n\t"\ 01061 "paddusw "#t", "#a" \n\t"\ 01062 "movd "#a", "#dst" \n\t"\ 01063 01064 #define HADAMARD8_DIFF_MMX(cpu) \ 01065 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 01066 DECLARE_ALIGNED(8, uint64_t, temp)[13];\ 01067 int sum;\ 01068 \ 01069 assert(h==8);\ 01070 \ 01071 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ 01072 \ 01073 __asm__ volatile(\ 01074 HADAMARD48\ 01075 \ 01076 "movq %%mm7, 96(%1) \n\t"\ 01077 \ 01078 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 01079 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ 01080 \ 01081 "movq 96(%1), %%mm7 \n\t"\ 01082 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 01083 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ 01084 \ 01085 : "=r" (sum)\ 01086 : "r"(temp)\ 01087 );\ 01088 \ 01089 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ 01090 \ 01091 __asm__ volatile(\ 01092 HADAMARD48\ 01093 \ 01094 "movq %%mm7, 96(%1) \n\t"\ 01095 \ 01096 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 01097 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ 01098 \ 01099 "movq 96(%1), %%mm7 \n\t"\ 01100 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 01101 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ 01102 "movq %%mm6, %%mm7 \n\t"\ 01103 "movq %%mm0, %%mm6 \n\t"\ 01104 \ 01105 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ 01106 \ 01107 HADAMARD48\ 01108 "movq %%mm7, 64(%1) \n\t"\ 01109 MMABS(%%mm0, %%mm7)\ 01110 MMABS(%%mm1, %%mm7)\ 01111 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 01112 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 01113 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 01114 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 01115 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 01116 "movq 64(%1), %%mm2 \n\t"\ 01117 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 01118 "paddusw %%mm1, %%mm0 \n\t"\ 01119 "movq %%mm0, 64(%1) \n\t"\ 01120 \ 01121 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ 01122 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ 01123 \ 01124 HADAMARD48\ 01125 "movq %%mm7, (%1) \n\t"\ 01126 MMABS(%%mm0, %%mm7)\ 01127 MMABS(%%mm1, %%mm7)\ 01128 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 01129 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 01130 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 01131 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 01132 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 01133 "movq (%1), %%mm2 \n\t"\ 01134 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 01135 "paddusw 64(%1), %%mm0 \n\t"\ 01136 "paddusw %%mm1, %%mm0 \n\t"\ 01137 \ 01138 HSUM(%%mm0, %%mm1, %0)\ 01139 \ 01140 : "=r" (sum)\ 01141 : "r"(temp)\ 01142 );\ 01143 return sum&0xFFFF;\ 01144 }\ 01145 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 01146 01147 #define HADAMARD8_DIFF_SSE2(cpu) \ 01148 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 01149 DECLARE_ALIGNED(16, uint64_t, temp)[4];\ 01150 int sum;\ 01151 \ 01152 assert(h==8);\ 01153 \ 01154 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ 01155 \ 01156 __asm__ volatile(\ 01157 HADAMARD8(%%xmm0, 
%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ 01158 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ 01159 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ 01160 MMABS_SUM_8x8\ 01161 HSUM_SSE2(%%xmm0, %%xmm1, %0)\ 01162 : "=r" (sum)\ 01163 : "r"(temp)\ 01164 );\ 01165 return sum&0xFFFF;\ 01166 }\ 01167 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 01168 01169 #define MMABS(a,z) MMABS_MMX(a,z) 01170 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 01171 HADAMARD8_DIFF_MMX(mmx) 01172 #undef MMABS 01173 #undef HSUM 01174 01175 #define MMABS(a,z) MMABS_MMX2(a,z) 01176 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 01177 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 01178 HADAMARD8_DIFF_MMX(mmx2) 01179 HADAMARD8_DIFF_SSE2(sse2) 01180 #undef MMABS 01181 #undef MMABS_SUM_8x8 01182 #undef HSUM 01183 01184 #if HAVE_SSSE3 01185 #define MMABS(a,z) MMABS_SSSE3(a,z) 01186 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL 01187 HADAMARD8_DIFF_SSE2(ssse3) 01188 #undef MMABS 01189 #undef MMABS_SUM_8x8 01190 #endif 01191 01192 #define DCT_SAD4(m,mm,o)\ 01193 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ 01194 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ 01195 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ 01196 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ 01197 MMABS_SUM(mm##2, mm##6, mm##0)\ 01198 MMABS_SUM(mm##3, mm##7, mm##1)\ 01199 MMABS_SUM(mm##4, mm##6, mm##0)\ 01200 MMABS_SUM(mm##5, mm##7, mm##1)\ 01201 01202 #define DCT_SAD_MMX\ 01203 "pxor %%mm0, %%mm0 \n\t"\ 01204 "pxor %%mm1, %%mm1 \n\t"\ 01205 DCT_SAD4(q, %%mm, 0)\ 01206 DCT_SAD4(q, %%mm, 8)\ 01207 DCT_SAD4(q, %%mm, 64)\ 01208 DCT_SAD4(q, %%mm, 72)\ 01209 "paddusw %%mm1, %%mm0 \n\t"\ 01210 HSUM(%%mm0, %%mm1, %0) 01211 01212 #define DCT_SAD_SSE2\ 01213 "pxor %%xmm0, %%xmm0 \n\t"\ 01214 "pxor %%xmm1, %%xmm1 \n\t"\ 01215 DCT_SAD4(dqa, %%xmm, 0)\ 01216 DCT_SAD4(dqa, %%xmm, 64)\ 01217 "paddusw %%xmm1, %%xmm0 \n\t"\ 01218 HSUM(%%xmm0, %%xmm1, %0) 01219 01220 #define DCT_SAD_FUNC(cpu) \ 01221 static int sum_abs_dctelem_##cpu(DCTELEM *block){\ 01222 int sum;\ 01223 __asm__ volatile(\ 01224 DCT_SAD\ 01225 :"=r"(sum)\ 01226 :"r"(block)\ 01227 );\ 01228 return sum&0xFFFF;\ 01229 } 01230 01231 #define DCT_SAD DCT_SAD_MMX 01232 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 01233 #define MMABS(a,z) MMABS_MMX(a,z) 01234 DCT_SAD_FUNC(mmx) 01235 #undef MMABS 01236 #undef HSUM 01237 01238 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 01239 #define MMABS(a,z) MMABS_MMX2(a,z) 01240 DCT_SAD_FUNC(mmx2) 01241 #undef HSUM 01242 #undef DCT_SAD 01243 01244 #define DCT_SAD DCT_SAD_SSE2 01245 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) 01246 DCT_SAD_FUNC(sse2) 01247 #undef MMABS 01248 01249 #if HAVE_SSSE3 01250 #define MMABS(a,z) MMABS_SSSE3(a,z) 01251 DCT_SAD_FUNC(ssse3) 01252 #undef MMABS 01253 #endif 01254 #undef HSUM 01255 #undef DCT_SAD 01256 01257 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ 01258 int sum; 01259 x86_reg i=size; 01260 __asm__ volatile( 01261 "pxor %%mm4, %%mm4 \n" 01262 "1: \n" 01263 "sub $8, %0 \n" 01264 "movq (%2,%0), %%mm2 \n" 01265 "movq (%3,%0,2), %%mm0 \n" 01266 "movq 8(%3,%0,2), %%mm1 \n" 01267 "punpckhbw %%mm2, %%mm3 \n" 01268 "punpcklbw %%mm2, %%mm2 \n" 01269 "psraw $8, %%mm3 \n" 01270 "psraw $8, %%mm2 \n" 01271 "psubw %%mm3, %%mm1 \n" 01272 "psubw %%mm2, %%mm0 \n" 01273 "pmaddwd %%mm1, %%mm1 \n" 01274 "pmaddwd %%mm0, %%mm0 \n" 01275 "paddd %%mm1, %%mm4 \n" 01276 "paddd %%mm0, %%mm4 \n" 01277 "jg 1b \n" 01278 "movq %%mm4, %%mm3 \n" 01279 "psrlq $32, %%mm3 \n" 01280 "paddd %%mm3, %%mm4 \n" 01281 
"movd %%mm4, %1 \n" 01282 :"+r"(i), "=r"(sum) 01283 :"r"(pix1), "r"(pix2) 01284 ); 01285 return sum; 01286 } 01287 01288 #define PHADDD(a, t)\ 01289 "movq "#a", "#t" \n\t"\ 01290 "psrlq $32, "#a" \n\t"\ 01291 "paddd "#t", "#a" \n\t" 01292 /* 01293 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] 01294 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] 01295 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] 01296 */ 01297 #define PMULHRW(x, y, s, o)\ 01298 "pmulhw " #s ", "#x " \n\t"\ 01299 "pmulhw " #s ", "#y " \n\t"\ 01300 "paddw " #o ", "#x " \n\t"\ 01301 "paddw " #o ", "#y " \n\t"\ 01302 "psraw $1, "#x " \n\t"\ 01303 "psraw $1, "#y " \n\t" 01304 #define DEF(x) x ## _mmx 01305 #define SET_RND MOVQ_WONE 01306 #define SCALE_OFFSET 1 01307 01308 #include "dsputil_mmx_qns_template.c" 01309 01310 #undef DEF 01311 #undef SET_RND 01312 #undef SCALE_OFFSET 01313 #undef PMULHRW 01314 01315 #define DEF(x) x ## _3dnow 01316 #define SET_RND(x) 01317 #define SCALE_OFFSET 0 01318 #define PMULHRW(x, y, s, o)\ 01319 "pmulhrw " #s ", "#x " \n\t"\ 01320 "pmulhrw " #s ", "#y " \n\t" 01321 01322 #include "dsputil_mmx_qns_template.c" 01323 01324 #undef DEF 01325 #undef SET_RND 01326 #undef SCALE_OFFSET 01327 #undef PMULHRW 01328 01329 #if HAVE_SSSE3 01330 #undef PHADDD 01331 #define DEF(x) x ## _ssse3 01332 #define SET_RND(x) 01333 #define SCALE_OFFSET -1 01334 #define PHADDD(a, t)\ 01335 "pshufw $0x0E, "#a", "#t" \n\t"\ 01336 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ 01337 #define PMULHRW(x, y, s, o)\ 01338 "pmulhrsw " #s ", "#x " \n\t"\ 01339 "pmulhrsw " #s ", "#y " \n\t" 01340 01341 #include "dsputil_mmx_qns_template.c" 01342 01343 #undef DEF 01344 #undef SET_RND 01345 #undef SCALE_OFFSET 01346 #undef PMULHRW 01347 #undef PHADDD 01348 #endif //HAVE_SSSE3 01349 01350 01351 void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) 01352 { 01353 if (mm_flags & FF_MM_MMX) { 01354 const int dct_algo = avctx->dct_algo; 01355 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 01356 if(mm_flags & FF_MM_SSE2){ 01357 c->fdct = ff_fdct_sse2; 01358 }else if(mm_flags & FF_MM_MMX2){ 01359 c->fdct = ff_fdct_mmx2; 01360 }else{ 01361 c->fdct = ff_fdct_mmx; 01362 } 01363 } 01364 01365 c->get_pixels = get_pixels_mmx; 01366 c->diff_pixels = diff_pixels_mmx; 01367 c->pix_sum = pix_sum16_mmx; 01368 01369 c->diff_bytes= diff_bytes_mmx; 01370 c->sum_abs_dctelem= sum_abs_dctelem_mmx; 01371 01372 c->hadamard8_diff[0]= hadamard8_diff16_mmx; 01373 c->hadamard8_diff[1]= hadamard8_diff_mmx; 01374 01375 c->pix_norm1 = pix_norm1_mmx; 01376 c->sse[0] = (mm_flags & FF_MM_SSE2) ? 
sse16_sse2 : sse16_mmx; 01377 c->sse[1] = sse8_mmx; 01378 c->vsad[4]= vsad_intra16_mmx; 01379 01380 c->nsse[0] = nsse16_mmx; 01381 c->nsse[1] = nsse8_mmx; 01382 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 01383 c->vsad[0] = vsad16_mmx; 01384 } 01385 01386 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 01387 c->try_8x8basis= try_8x8basis_mmx; 01388 } 01389 c->add_8x8basis= add_8x8basis_mmx; 01390 01391 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; 01392 01393 01394 if (mm_flags & FF_MM_MMX2) { 01395 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; 01396 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; 01397 c->hadamard8_diff[1]= hadamard8_diff_mmx2; 01398 c->vsad[4]= vsad_intra16_mmx2; 01399 01400 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 01401 c->vsad[0] = vsad16_mmx2; 01402 } 01403 01404 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; 01405 } 01406 01407 if(mm_flags & FF_MM_SSE2){ 01408 c->get_pixels = get_pixels_sse2; 01409 c->sum_abs_dctelem= sum_abs_dctelem_sse2; 01410 c->hadamard8_diff[0]= hadamard8_diff16_sse2; 01411 c->hadamard8_diff[1]= hadamard8_diff_sse2; 01412 #if CONFIG_LPC 01413 c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; 01414 #endif 01415 } 01416 01417 #if HAVE_SSSE3 01418 if(mm_flags & FF_MM_SSSE3){ 01419 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 01420 c->try_8x8basis= try_8x8basis_ssse3; 01421 } 01422 c->add_8x8basis= add_8x8basis_ssse3; 01423 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; 01424 c->hadamard8_diff[0]= hadamard8_diff16_ssse3; 01425 c->hadamard8_diff[1]= hadamard8_diff_ssse3; 01426 } 01427 #endif 01428 01429 if(mm_flags & FF_MM_3DNOW){ 01430 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 01431 c->try_8x8basis= try_8x8basis_3dnow; 01432 } 01433 c->add_8x8basis= add_8x8basis_3dnow; 01434 } 01435 } 01436 01437 dsputil_init_pix_mmx(c, avctx); 01438 }
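The sum-of-squared-errors kernels above (sse8_mmx, sse16_mmx, sse16_sse2) all use the byte trick spelled out in their comments: for unsigned bytes the absolute difference is the OR of the two saturated differences, because one of them is always zero, and the result is then widened to 16 bits and squared with pmaddwd. A minimal scalar sketch of what one 16-pixel-wide row loop computes; the _ref name is illustrative and not part of libavcodec:

#include <stdint.h>

static int sse16_ref(const uint8_t *pix1, const uint8_t *pix2,
                     int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            /* psubusb in both directions: one of d1/d2 is always 0 */
            uint8_t d1 = pix1[x] > pix2[x] ? pix1[x] - pix2[x] : 0;
            uint8_t d2 = pix2[x] > pix1[x] ? pix2[x] - pix1[x] : 0;
            uint8_t d  = d1 | d2;   /* |pix1[x] - pix2[x]| */
            sum += d * d;           /* squared and accumulated via pmaddwd/paddd */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}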
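sub_hfyu_median_prediction_mmx2() subtracts the HuffYUV median predictor from each byte: L + T - LT clipped to [min(L, T), max(L, T)], which the asm builds with psubb/paddb followed by pmaxub/pminub. A scalar sketch under the same wrap-around byte arithmetic; the first byte and the *left/*left_top bookkeeping are handled separately in the real function, and the _ref name is illustrative:

#include <stdint.h>

static void sub_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int w)
{
    /* src1 = previous line (T/LT), src2 = current line (X/L);
       dst[0] is fixed up with mid_pred() in the real code */
    for (int i = 1; i < w; i++) {
        uint8_t lt   = src1[i - 1];
        uint8_t t    = src1[i];
        uint8_t l    = src2[i - 1];
        uint8_t lo   = l < t ? l : t;
        uint8_t hi   = l > t ? l : t;
        uint8_t grad = (uint8_t)(l + t - lt);          /* paddb/psubb wrap mod 256 */
        uint8_t pred = grad < lo ? lo : grad > hi ? hi : grad;
        dst[i] = (uint8_t)(src2[i] - pred);            /* psubb */
    }
}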
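LBUTTERFLY2 in the Hadamard transform code performs a butterfly on two register pairs without a scratch register: after paddw/paddw/psubw one operand holds the sum and the other the difference. Per 16-bit lane it is equivalent to this sketch (the name is illustrative):

#include <stdint.h>

static void lbutterfly_ref(int16_t *a, int16_t *b)
{
    int16_t sum  = *a + *b;   /* paddw b, a                        */
    int16_t diff = *b - *a;   /* paddw b, b ; psubw a, b           */
    *a = sum;                 /* i.e. 2*b - (a + b) == b - old a   */
    *b = diff;
}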
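The comment above PMULHRW summarizes the three multiply-high variants used by the dsputil_mmx_qns_template.c instantiations. In scalar form they correspond to the following sketch; the helper names are mine, and the results wrap to the low 16 bits exactly like the instructions:

#include <stdint.h>

static int16_t pmulhw_ref(int16_t a, int16_t b)   /* MMX    */
{ return (int16_t)(((int32_t)a * b) >> 16); }

static int16_t pmulhrw_ref(int16_t a, int16_t b)  /* 3DNow! */
{ return (int16_t)(((int32_t)a * b + 0x8000) >> 16); }

static int16_t pmulhrsw_ref(int16_t a, int16_t b) /* SSSE3  */
{ return (int16_t)(((int32_t)a * b + 0x4000) >> 15); }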