Libav
|
00001 /* 00002 * software RGB to RGB converter 00003 * pluralize by software PAL8 to RGB converter 00004 * software YUV to YUV converter 00005 * software YUV to RGB converter 00006 * Written by Nick Kurshev. 00007 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) 00008 * lot of big-endian byte order fixes by Alex Beregszaszi 00009 * 00010 * This file is part of FFmpeg. 00011 * 00012 * FFmpeg is free software; you can redistribute it and/or 00013 * modify it under the terms of the GNU Lesser General Public 00014 * License as published by the Free Software Foundation; either 00015 * version 2.1 of the License, or (at your option) any later version. 00016 * 00017 * FFmpeg is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00020 * Lesser General Public License for more details. 00021 * 00022 * You should have received a copy of the GNU Lesser General Public 00023 * License along with FFmpeg; if not, write to the Free Software 00024 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00025 */ 00026 00027 #include <stddef.h> 00028 00029 #undef PREFETCH 00030 #undef MOVNTQ 00031 #undef EMMS 00032 #undef SFENCE 00033 #undef MMREG_SIZE 00034 #undef PAVGB 00035 00036 #if HAVE_SSE2 00037 #define MMREG_SIZE 16 00038 #else 00039 #define MMREG_SIZE 8 00040 #endif 00041 00042 #if HAVE_AMD3DNOW 00043 #define PREFETCH "prefetch" 00044 #define PAVGB "pavgusb" 00045 #elif HAVE_MMX2 00046 #define PREFETCH "prefetchnta" 00047 #define PAVGB "pavgb" 00048 #else 00049 #define PREFETCH " # nop" 00050 #endif 00051 00052 #if HAVE_AMD3DNOW 00053 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ 00054 #define EMMS "femms" 00055 #else 00056 #define EMMS "emms" 00057 #endif 00058 00059 #if HAVE_MMX2 00060 #define MOVNTQ "movntq" 00061 #define SFENCE "sfence" 00062 #else 00063 #define MOVNTQ "movq" 00064 #define SFENCE " # nop" 00065 #endif 00066 00067 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) 00068 { 00069 uint8_t *dest = dst; 00070 const uint8_t *s = src; 00071 const uint8_t *end; 00072 #if HAVE_MMX 00073 const uint8_t *mm_end; 00074 #endif 00075 end = s + src_size; 00076 #if HAVE_MMX 00077 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 00078 mm_end = end - 23; 00079 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); 00080 while (s < mm_end) { 00081 __asm__ volatile( 00082 PREFETCH" 32%1 \n\t" 00083 "movd %1, %%mm0 \n\t" 00084 "punpckldq 3%1, %%mm0 \n\t" 00085 "movd 6%1, %%mm1 \n\t" 00086 "punpckldq 9%1, %%mm1 \n\t" 00087 "movd 12%1, %%mm2 \n\t" 00088 "punpckldq 15%1, %%mm2 \n\t" 00089 "movd 18%1, %%mm3 \n\t" 00090 "punpckldq 21%1, %%mm3 \n\t" 00091 "por %%mm7, %%mm0 \n\t" 00092 "por %%mm7, %%mm1 \n\t" 00093 "por %%mm7, %%mm2 \n\t" 00094 "por %%mm7, %%mm3 \n\t" 00095 MOVNTQ" %%mm0, %0 \n\t" 00096 MOVNTQ" %%mm1, 8%0 \n\t" 00097 MOVNTQ" %%mm2, 16%0 \n\t" 00098 MOVNTQ" %%mm3, 24%0" 00099 :"=m"(*dest) 00100 :"m"(*s) 00101 :"memory"); 00102 dest += 32; 00103 s += 24; 00104 } 00105 __asm__ volatile(SFENCE:::"memory"); 00106 __asm__ volatile(EMMS:::"memory"); 00107 #endif 00108 while (s < end) { 00109 #if HAVE_BIGENDIAN 00110 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ 00111 *dest++ = 255; 00112 *dest++ = s[2]; 00113 *dest++ = s[1]; 00114 *dest++ = s[0]; 00115 s+=3; 00116 #else 00117 *dest++ = *s++; 00118 *dest++ = *s++; 00119 *dest++ = *s++; 00120 *dest++ = 255; 00121 #endif 00122 } 00123 } 00124 00125 #define STORE_BGR24_MMX \ 00126 "psrlq $8, %%mm2 \n\t" \ 00127 "psrlq $8, %%mm3 \n\t" \ 00128 "psrlq $8, %%mm6 \n\t" \ 00129 "psrlq $8, %%mm7 \n\t" \ 00130 "pand "MANGLE(mask24l)", %%mm0\n\t" \ 00131 "pand "MANGLE(mask24l)", %%mm1\n\t" \ 00132 "pand "MANGLE(mask24l)", %%mm4\n\t" \ 00133 "pand "MANGLE(mask24l)", %%mm5\n\t" \ 00134 "pand "MANGLE(mask24h)", %%mm2\n\t" \ 00135 "pand "MANGLE(mask24h)", %%mm3\n\t" \ 00136 "pand "MANGLE(mask24h)", %%mm6\n\t" \ 00137 "pand "MANGLE(mask24h)", %%mm7\n\t" \ 00138 "por %%mm2, %%mm0 \n\t" \ 00139 "por %%mm3, %%mm1 \n\t" \ 00140 "por %%mm6, %%mm4 \n\t" \ 00141 "por %%mm7, %%mm5 \n\t" \ 00142 \ 00143 "movq %%mm1, %%mm2 \n\t" \ 00144 "movq %%mm4, %%mm3 \n\t" \ 00145 "psllq $48, %%mm2 \n\t" \ 00146 "psllq $32, %%mm3 \n\t" \ 00147 "pand "MANGLE(mask24hh)", %%mm2\n\t" \ 00148 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \ 00149 "por %%mm2, %%mm0 \n\t" \ 00150 "psrlq $16, %%mm1 \n\t" \ 00151 "psrlq $32, %%mm4 \n\t" \ 00152 "psllq $16, %%mm5 \n\t" \ 00153 "por %%mm3, %%mm1 \n\t" \ 00154 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \ 00155 "por %%mm5, %%mm4 \n\t" \ 00156 \ 00157 MOVNTQ" %%mm0, %0 \n\t" \ 00158 MOVNTQ" %%mm1, 8%0 \n\t" \ 00159 MOVNTQ" %%mm4, 16%0" 00160 00161 00162 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 00163 { 00164 uint8_t *dest = dst; 00165 const uint8_t *s = src; 00166 const uint8_t *end; 00167 #if HAVE_MMX 00168 const uint8_t *mm_end; 00169 #endif 00170 end = s + src_size; 00171 #if HAVE_MMX 00172 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 00173 mm_end = end - 31; 00174 while (s < mm_end) { 00175 __asm__ volatile( 00176 PREFETCH" 32%1 \n\t" 00177 "movq %1, %%mm0 \n\t" 00178 "movq 8%1, %%mm1 \n\t" 00179 "movq 16%1, %%mm4 \n\t" 00180 "movq 24%1, %%mm5 \n\t" 00181 "movq %%mm0, %%mm2 \n\t" 00182 "movq %%mm1, %%mm3 \n\t" 00183 "movq %%mm4, %%mm6 \n\t" 00184 "movq %%mm5, %%mm7 \n\t" 00185 STORE_BGR24_MMX 00186 :"=m"(*dest) 00187 :"m"(*s) 00188 :"memory"); 00189 dest += 24; 00190 s += 32; 00191 } 00192 __asm__ volatile(SFENCE:::"memory"); 00193 __asm__ volatile(EMMS:::"memory"); 00194 #endif 00195 while (s < end) { 00196 #if HAVE_BIGENDIAN 00197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ 00198 s++; 00199 dest[2] = *s++; 00200 dest[1] = *s++; 00201 dest[0] = *s++; 00202 dest += 3; 00203 #else 00204 *dest++ = *s++; 00205 *dest++ = *s++; 00206 *dest++ = *s++; 00207 s++; 00208 #endif 00209 } 00210 } 00211 00212 /* 00213 original by Strepto/Astral 00214 ported to gcc & bugfixed: A'rpi 00215 MMX2, 3DNOW optimization by Nick Kurshev 00216 32-bit C version, and and&add trick by Michael Niedermayer 00217 */ 00218 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) 00219 { 00220 register const uint8_t* s=src; 00221 register uint8_t* d=dst; 00222 register const uint8_t *end; 00223 const uint8_t *mm_end; 00224 end = s + src_size; 00225 #if HAVE_MMX 00226 __asm__ volatile(PREFETCH" %0"::"m"(*s)); 00227 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); 00228 mm_end = end - 15; 00229 while (s<mm_end) { 00230 __asm__ volatile( 00231 PREFETCH" 32%1 \n\t" 00232 "movq %1, %%mm0 \n\t" 00233 "movq 8%1, %%mm2 \n\t" 00234 "movq %%mm0, %%mm1 \n\t" 00235 "movq %%mm2, %%mm3 \n\t" 00236 "pand %%mm4, %%mm0 \n\t" 00237 "pand %%mm4, %%mm2 \n\t" 00238 "paddw %%mm1, %%mm0 \n\t" 00239 "paddw %%mm3, %%mm2 \n\t" 00240 MOVNTQ" %%mm0, %0 \n\t" 00241 MOVNTQ" %%mm2, 8%0" 00242 :"=m"(*d) 00243 :"m"(*s) 00244 ); 00245 d+=16; 00246 s+=16; 00247 } 00248 __asm__ volatile(SFENCE:::"memory"); 00249 __asm__ volatile(EMMS:::"memory"); 00250 #endif 00251 mm_end = end - 3; 00252 while (s < mm_end) { 00253 register unsigned x= *((const uint32_t *)s); 00254 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); 00255 d+=4; 00256 s+=4; 00257 } 00258 if (s < end) { 00259 register unsigned short x= *((const uint16_t *)s); 00260 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); 00261 } 00262 } 00263 00264 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size) 00265 { 00266 register const uint8_t* s=src; 00267 register uint8_t* d=dst; 00268 register const uint8_t *end; 00269 const uint8_t *mm_end; 00270 end = s + src_size; 00271 #if HAVE_MMX 00272 __asm__ volatile(PREFETCH" %0"::"m"(*s)); 00273 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); 00274 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); 00275 mm_end = end - 15; 00276 while (s<mm_end) { 00277 __asm__ volatile( 00278 PREFETCH" 32%1 \n\t" 00279 "movq %1, %%mm0 \n\t" 00280 "movq 8%1, %%mm2 \n\t" 00281 "movq %%mm0, %%mm1 \n\t" 00282 "movq %%mm2, %%mm3 \n\t" 00283 "psrlq $1, %%mm0 \n\t" 00284 "psrlq $1, %%mm2 \n\t" 00285 "pand %%mm7, %%mm0 \n\t" 00286 "pand %%mm7, %%mm2 \n\t" 00287 "pand %%mm6, %%mm1 \n\t" 00288 "pand %%mm6, %%mm3 \n\t" 00289 "por %%mm1, %%mm0 \n\t" 00290 "por %%mm3, %%mm2 \n\t" 00291 MOVNTQ" %%mm0, %0 \n\t" 00292 MOVNTQ" %%mm2, 8%0" 00293 :"=m"(*d) 00294 :"m"(*s) 00295 ); 00296 d+=16; 00297 s+=16; 00298 } 00299 __asm__ volatile(SFENCE:::"memory"); 00300 __asm__ volatile(EMMS:::"memory"); 00301 #endif 00302 mm_end = end - 3; 00303 while (s < mm_end) { 00304 register uint32_t x= *((const uint32_t*)s); 00305 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); 00306 s+=4; 00307 d+=4; 00308 } 00309 if (s < end) { 00310 register uint16_t x= *((const uint16_t*)s); 00311 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); 00312 } 00313 } 00314 00315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) 00316 { 00317 const uint8_t *s = src; 00318 const uint8_t *end; 00319 #if HAVE_MMX 00320 const uint8_t *mm_end; 00321 #endif 00322 uint16_t *d = (uint16_t *)dst; 00323 end = s + src_size; 00324 #if HAVE_MMX 00325 mm_end = end - 15; 00326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 00327 __asm__ volatile( 00328 "movq %3, %%mm5 \n\t" 00329 "movq %4, %%mm6 \n\t" 00330 "movq %5, %%mm7 \n\t" 00331 "jmp 2f \n\t" 00332 ASMALIGN(4) 00333 "1: \n\t" 00334 PREFETCH" 32(%1) \n\t" 00335 "movd (%1), %%mm0 \n\t" 00336 "movd 4(%1), %%mm3 \n\t" 00337 "punpckldq 8(%1), %%mm0 \n\t" 00338 "punpckldq 12(%1), %%mm3 \n\t" 00339 "movq %%mm0, %%mm1 \n\t" 00340 "movq %%mm3, %%mm4 \n\t" 00341 "pand %%mm6, %%mm0 \n\t" 00342 "pand %%mm6, %%mm3 \n\t" 00343 "pmaddwd %%mm7, %%mm0 \n\t" 00344 "pmaddwd %%mm7, %%mm3 \n\t" 00345 "pand %%mm5, %%mm1 \n\t" 00346 "pand %%mm5, %%mm4 \n\t" 00347 "por %%mm1, %%mm0 \n\t" 00348 "por %%mm4, %%mm3 \n\t" 00349 "psrld $5, %%mm0 \n\t" 00350 "pslld $11, %%mm3 \n\t" 00351 "por %%mm3, %%mm0 \n\t" 00352 MOVNTQ" %%mm0, (%0) \n\t" 00353 "add $16, %1 \n\t" 00354 "add $8, %0 \n\t" 00355 "2: \n\t" 00356 "cmp %2, %1 \n\t" 00357 " jb 1b \n\t" 00358 : "+r" (d), "+r"(s) 00359 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) 00360 ); 00361 #else 00362 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00363 __asm__ volatile( 00364 "movq %0, %%mm7 \n\t" 00365 "movq %1, %%mm6 \n\t" 00366 ::"m"(red_16mask),"m"(green_16mask)); 00367 while (s < mm_end) { 00368 __asm__ volatile( 00369 PREFETCH" 32%1 \n\t" 00370 "movd %1, %%mm0 \n\t" 00371 "movd 4%1, %%mm3 \n\t" 00372 "punpckldq 8%1, %%mm0 \n\t" 00373 "punpckldq 12%1, %%mm3 \n\t" 00374 "movq %%mm0, %%mm1 \n\t" 00375 "movq %%mm0, %%mm2 \n\t" 00376 "movq %%mm3, %%mm4 \n\t" 00377 "movq %%mm3, %%mm5 \n\t" 00378 "psrlq $3, %%mm0 \n\t" 00379 "psrlq $3, %%mm3 \n\t" 00380 "pand %2, %%mm0 \n\t" 00381 "pand %2, %%mm3 \n\t" 00382 "psrlq $5, %%mm1 \n\t" 00383 "psrlq $5, %%mm4 \n\t" 00384 "pand %%mm6, %%mm1 \n\t" 00385 "pand %%mm6, %%mm4 \n\t" 00386 "psrlq $8, %%mm2 \n\t" 00387 "psrlq $8, %%mm5 \n\t" 00388 "pand %%mm7, %%mm2 \n\t" 00389 "pand %%mm7, %%mm5 \n\t" 00390 "por %%mm1, %%mm0 \n\t" 00391 "por %%mm4, %%mm3 \n\t" 00392 "por %%mm2, %%mm0 \n\t" 00393 "por %%mm5, %%mm3 \n\t" 00394 "psllq $16, %%mm3 \n\t" 00395 "por %%mm3, %%mm0 \n\t" 00396 MOVNTQ" %%mm0, %0 \n\t" 00397 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00398 d += 4; 00399 s += 16; 00400 } 00401 #endif 00402 __asm__ volatile(SFENCE:::"memory"); 00403 __asm__ volatile(EMMS:::"memory"); 00404 #endif 00405 while (s < end) { 00406 register int rgb = *(const uint32_t*)s; s += 4; 00407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); 00408 } 00409 } 00410 00411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) 00412 { 00413 const uint8_t *s = src; 00414 const uint8_t *end; 00415 #if HAVE_MMX 00416 const uint8_t *mm_end; 00417 #endif 00418 uint16_t *d = (uint16_t *)dst; 00419 end = s + src_size; 00420 #if HAVE_MMX 00421 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00422 __asm__ volatile( 00423 "movq %0, %%mm7 \n\t" 00424 "movq %1, %%mm6 \n\t" 00425 ::"m"(red_16mask),"m"(green_16mask)); 00426 mm_end = end - 15; 00427 while (s < mm_end) { 00428 __asm__ volatile( 00429 PREFETCH" 32%1 \n\t" 00430 "movd %1, %%mm0 \n\t" 00431 "movd 4%1, %%mm3 \n\t" 00432 "punpckldq 8%1, %%mm0 \n\t" 00433 "punpckldq 12%1, %%mm3 \n\t" 00434 "movq %%mm0, %%mm1 \n\t" 00435 "movq %%mm0, %%mm2 \n\t" 00436 "movq %%mm3, %%mm4 \n\t" 00437 "movq %%mm3, %%mm5 \n\t" 00438 "psllq $8, %%mm0 \n\t" 00439 "psllq $8, %%mm3 \n\t" 00440 "pand %%mm7, %%mm0 \n\t" 00441 "pand %%mm7, %%mm3 \n\t" 00442 "psrlq $5, %%mm1 \n\t" 00443 "psrlq $5, %%mm4 \n\t" 00444 "pand %%mm6, %%mm1 \n\t" 00445 "pand %%mm6, %%mm4 \n\t" 00446 "psrlq $19, %%mm2 \n\t" 00447 "psrlq $19, %%mm5 \n\t" 00448 "pand %2, %%mm2 \n\t" 00449 "pand %2, %%mm5 \n\t" 00450 "por %%mm1, %%mm0 \n\t" 00451 "por %%mm4, %%mm3 \n\t" 00452 "por %%mm2, %%mm0 \n\t" 00453 "por %%mm5, %%mm3 \n\t" 00454 "psllq $16, %%mm3 \n\t" 00455 "por %%mm3, %%mm0 \n\t" 00456 MOVNTQ" %%mm0, %0 \n\t" 00457 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00458 d += 4; 00459 s += 16; 00460 } 00461 __asm__ volatile(SFENCE:::"memory"); 00462 __asm__ volatile(EMMS:::"memory"); 00463 #endif 00464 while (s < end) { 00465 register int rgb = *(const uint32_t*)s; s += 4; 00466 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); 00467 } 00468 } 00469 00470 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) 00471 { 00472 const uint8_t *s = src; 00473 const uint8_t *end; 00474 #if HAVE_MMX 00475 const uint8_t *mm_end; 00476 #endif 00477 uint16_t *d = (uint16_t *)dst; 00478 end = s + src_size; 00479 #if HAVE_MMX 00480 mm_end = end - 15; 00481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 00482 __asm__ volatile( 00483 "movq %3, %%mm5 \n\t" 00484 "movq %4, %%mm6 \n\t" 00485 "movq %5, %%mm7 \n\t" 00486 "jmp 2f \n\t" 00487 ASMALIGN(4) 00488 "1: \n\t" 00489 PREFETCH" 32(%1) \n\t" 00490 "movd (%1), %%mm0 \n\t" 00491 "movd 4(%1), %%mm3 \n\t" 00492 "punpckldq 8(%1), %%mm0 \n\t" 00493 "punpckldq 12(%1), %%mm3 \n\t" 00494 "movq %%mm0, %%mm1 \n\t" 00495 "movq %%mm3, %%mm4 \n\t" 00496 "pand %%mm6, %%mm0 \n\t" 00497 "pand %%mm6, %%mm3 \n\t" 00498 "pmaddwd %%mm7, %%mm0 \n\t" 00499 "pmaddwd %%mm7, %%mm3 \n\t" 00500 "pand %%mm5, %%mm1 \n\t" 00501 "pand %%mm5, %%mm4 \n\t" 00502 "por %%mm1, %%mm0 \n\t" 00503 "por %%mm4, %%mm3 \n\t" 00504 "psrld $6, %%mm0 \n\t" 00505 "pslld $10, %%mm3 \n\t" 00506 "por %%mm3, %%mm0 \n\t" 00507 MOVNTQ" %%mm0, (%0) \n\t" 00508 "add $16, %1 \n\t" 00509 "add $8, %0 \n\t" 00510 "2: \n\t" 00511 "cmp %2, %1 \n\t" 00512 " jb 1b \n\t" 00513 : "+r" (d), "+r"(s) 00514 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) 00515 ); 00516 #else 00517 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00518 __asm__ volatile( 00519 "movq %0, %%mm7 \n\t" 00520 "movq %1, %%mm6 \n\t" 00521 ::"m"(red_15mask),"m"(green_15mask)); 00522 while (s < mm_end) { 00523 __asm__ volatile( 00524 PREFETCH" 32%1 \n\t" 00525 "movd %1, %%mm0 \n\t" 00526 "movd 4%1, %%mm3 \n\t" 00527 "punpckldq 8%1, %%mm0 \n\t" 00528 "punpckldq 12%1, %%mm3 \n\t" 00529 "movq %%mm0, %%mm1 \n\t" 00530 "movq %%mm0, %%mm2 \n\t" 00531 "movq %%mm3, %%mm4 \n\t" 00532 "movq %%mm3, %%mm5 \n\t" 00533 "psrlq $3, %%mm0 \n\t" 00534 "psrlq $3, %%mm3 \n\t" 00535 "pand %2, %%mm0 \n\t" 00536 "pand %2, %%mm3 \n\t" 00537 "psrlq $6, %%mm1 \n\t" 00538 "psrlq $6, %%mm4 \n\t" 00539 "pand %%mm6, %%mm1 \n\t" 00540 "pand %%mm6, %%mm4 \n\t" 00541 "psrlq $9, %%mm2 \n\t" 00542 "psrlq $9, %%mm5 \n\t" 00543 "pand %%mm7, %%mm2 \n\t" 00544 "pand %%mm7, %%mm5 \n\t" 00545 "por %%mm1, %%mm0 \n\t" 00546 "por %%mm4, %%mm3 \n\t" 00547 "por %%mm2, %%mm0 \n\t" 00548 "por %%mm5, %%mm3 \n\t" 00549 "psllq $16, %%mm3 \n\t" 00550 "por %%mm3, %%mm0 \n\t" 00551 MOVNTQ" %%mm0, %0 \n\t" 00552 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00553 d += 4; 00554 s += 16; 00555 } 00556 #endif 00557 __asm__ volatile(SFENCE:::"memory"); 00558 __asm__ volatile(EMMS:::"memory"); 00559 #endif 00560 while (s < end) { 00561 register int rgb = *(const uint32_t*)s; s += 4; 00562 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); 00563 } 00564 } 00565 00566 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) 00567 { 00568 const uint8_t *s = src; 00569 const uint8_t *end; 00570 #if HAVE_MMX 00571 const uint8_t *mm_end; 00572 #endif 00573 uint16_t *d = (uint16_t *)dst; 00574 end = s + src_size; 00575 #if HAVE_MMX 00576 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00577 __asm__ volatile( 00578 "movq %0, %%mm7 \n\t" 00579 "movq %1, %%mm6 \n\t" 00580 ::"m"(red_15mask),"m"(green_15mask)); 00581 mm_end = end - 15; 00582 while (s < mm_end) { 00583 __asm__ volatile( 00584 PREFETCH" 32%1 \n\t" 00585 "movd %1, %%mm0 \n\t" 00586 "movd 4%1, %%mm3 \n\t" 00587 "punpckldq 8%1, %%mm0 \n\t" 00588 "punpckldq 12%1, %%mm3 \n\t" 00589 "movq %%mm0, %%mm1 \n\t" 00590 "movq %%mm0, %%mm2 \n\t" 00591 "movq %%mm3, %%mm4 \n\t" 00592 "movq %%mm3, %%mm5 \n\t" 00593 "psllq $7, %%mm0 \n\t" 00594 "psllq $7, %%mm3 \n\t" 00595 "pand %%mm7, %%mm0 \n\t" 00596 "pand %%mm7, %%mm3 \n\t" 00597 "psrlq $6, %%mm1 \n\t" 00598 "psrlq $6, %%mm4 \n\t" 00599 "pand %%mm6, %%mm1 \n\t" 00600 "pand %%mm6, %%mm4 \n\t" 00601 "psrlq $19, %%mm2 \n\t" 00602 "psrlq $19, %%mm5 \n\t" 00603 "pand %2, %%mm2 \n\t" 00604 "pand %2, %%mm5 \n\t" 00605 "por %%mm1, %%mm0 \n\t" 00606 "por %%mm4, %%mm3 \n\t" 00607 "por %%mm2, %%mm0 \n\t" 00608 "por %%mm5, %%mm3 \n\t" 00609 "psllq $16, %%mm3 \n\t" 00610 "por %%mm3, %%mm0 \n\t" 00611 MOVNTQ" %%mm0, %0 \n\t" 00612 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00613 d += 4; 00614 s += 16; 00615 } 00616 __asm__ volatile(SFENCE:::"memory"); 00617 __asm__ volatile(EMMS:::"memory"); 00618 #endif 00619 while (s < end) { 00620 register int rgb = *(const uint32_t*)s; s += 4; 00621 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); 00622 } 00623 } 00624 00625 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) 00626 { 00627 const uint8_t *s = src; 00628 const uint8_t *end; 00629 #if HAVE_MMX 00630 const uint8_t *mm_end; 00631 #endif 00632 uint16_t *d = (uint16_t *)dst; 00633 end = s + src_size; 00634 #if HAVE_MMX 00635 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00636 __asm__ volatile( 00637 "movq %0, %%mm7 \n\t" 00638 "movq %1, %%mm6 \n\t" 00639 ::"m"(red_16mask),"m"(green_16mask)); 00640 mm_end = end - 11; 00641 while (s < mm_end) { 00642 __asm__ volatile( 00643 PREFETCH" 32%1 \n\t" 00644 "movd %1, %%mm0 \n\t" 00645 "movd 3%1, %%mm3 \n\t" 00646 "punpckldq 6%1, %%mm0 \n\t" 00647 "punpckldq 9%1, %%mm3 \n\t" 00648 "movq %%mm0, %%mm1 \n\t" 00649 "movq %%mm0, %%mm2 \n\t" 00650 "movq %%mm3, %%mm4 \n\t" 00651 "movq %%mm3, %%mm5 \n\t" 00652 "psrlq $3, %%mm0 \n\t" 00653 "psrlq $3, %%mm3 \n\t" 00654 "pand %2, %%mm0 \n\t" 00655 "pand %2, %%mm3 \n\t" 00656 "psrlq $5, %%mm1 \n\t" 00657 "psrlq $5, %%mm4 \n\t" 00658 "pand %%mm6, %%mm1 \n\t" 00659 "pand %%mm6, %%mm4 \n\t" 00660 "psrlq $8, %%mm2 \n\t" 00661 "psrlq $8, %%mm5 \n\t" 00662 "pand %%mm7, %%mm2 \n\t" 00663 "pand %%mm7, %%mm5 \n\t" 00664 "por %%mm1, %%mm0 \n\t" 00665 "por %%mm4, %%mm3 \n\t" 00666 "por %%mm2, %%mm0 \n\t" 00667 "por %%mm5, %%mm3 \n\t" 00668 "psllq $16, %%mm3 \n\t" 00669 "por %%mm3, %%mm0 \n\t" 00670 MOVNTQ" %%mm0, %0 \n\t" 00671 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00672 d += 4; 00673 s += 12; 00674 } 00675 __asm__ volatile(SFENCE:::"memory"); 00676 __asm__ volatile(EMMS:::"memory"); 00677 #endif 00678 while (s < end) { 00679 const int b = *s++; 00680 const int g = *s++; 00681 const int r = *s++; 00682 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 00683 } 00684 } 00685 00686 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) 00687 { 00688 const uint8_t *s = src; 00689 const uint8_t *end; 00690 #if HAVE_MMX 00691 const uint8_t *mm_end; 00692 #endif 00693 uint16_t *d = (uint16_t *)dst; 00694 end = s + src_size; 00695 #if HAVE_MMX 00696 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00697 __asm__ volatile( 00698 "movq %0, %%mm7 \n\t" 00699 "movq %1, %%mm6 \n\t" 00700 ::"m"(red_16mask),"m"(green_16mask)); 00701 mm_end = end - 15; 00702 while (s < mm_end) { 00703 __asm__ volatile( 00704 PREFETCH" 32%1 \n\t" 00705 "movd %1, %%mm0 \n\t" 00706 "movd 3%1, %%mm3 \n\t" 00707 "punpckldq 6%1, %%mm0 \n\t" 00708 "punpckldq 9%1, %%mm3 \n\t" 00709 "movq %%mm0, %%mm1 \n\t" 00710 "movq %%mm0, %%mm2 \n\t" 00711 "movq %%mm3, %%mm4 \n\t" 00712 "movq %%mm3, %%mm5 \n\t" 00713 "psllq $8, %%mm0 \n\t" 00714 "psllq $8, %%mm3 \n\t" 00715 "pand %%mm7, %%mm0 \n\t" 00716 "pand %%mm7, %%mm3 \n\t" 00717 "psrlq $5, %%mm1 \n\t" 00718 "psrlq $5, %%mm4 \n\t" 00719 "pand %%mm6, %%mm1 \n\t" 00720 "pand %%mm6, %%mm4 \n\t" 00721 "psrlq $19, %%mm2 \n\t" 00722 "psrlq $19, %%mm5 \n\t" 00723 "pand %2, %%mm2 \n\t" 00724 "pand %2, %%mm5 \n\t" 00725 "por %%mm1, %%mm0 \n\t" 00726 "por %%mm4, %%mm3 \n\t" 00727 "por %%mm2, %%mm0 \n\t" 00728 "por %%mm5, %%mm3 \n\t" 00729 "psllq $16, %%mm3 \n\t" 00730 "por %%mm3, %%mm0 \n\t" 00731 MOVNTQ" %%mm0, %0 \n\t" 00732 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00733 d += 4; 00734 s += 12; 00735 } 00736 __asm__ volatile(SFENCE:::"memory"); 00737 __asm__ volatile(EMMS:::"memory"); 00738 #endif 00739 while (s < end) { 00740 const int r = *s++; 00741 const int g = *s++; 00742 const int b = *s++; 00743 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 00744 } 00745 } 00746 00747 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) 00748 { 00749 const uint8_t *s = src; 00750 const uint8_t *end; 00751 #if HAVE_MMX 00752 const uint8_t *mm_end; 00753 #endif 00754 uint16_t *d = (uint16_t *)dst; 00755 end = s + src_size; 00756 #if HAVE_MMX 00757 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00758 __asm__ volatile( 00759 "movq %0, %%mm7 \n\t" 00760 "movq %1, %%mm6 \n\t" 00761 ::"m"(red_15mask),"m"(green_15mask)); 00762 mm_end = end - 11; 00763 while (s < mm_end) { 00764 __asm__ volatile( 00765 PREFETCH" 32%1 \n\t" 00766 "movd %1, %%mm0 \n\t" 00767 "movd 3%1, %%mm3 \n\t" 00768 "punpckldq 6%1, %%mm0 \n\t" 00769 "punpckldq 9%1, %%mm3 \n\t" 00770 "movq %%mm0, %%mm1 \n\t" 00771 "movq %%mm0, %%mm2 \n\t" 00772 "movq %%mm3, %%mm4 \n\t" 00773 "movq %%mm3, %%mm5 \n\t" 00774 "psrlq $3, %%mm0 \n\t" 00775 "psrlq $3, %%mm3 \n\t" 00776 "pand %2, %%mm0 \n\t" 00777 "pand %2, %%mm3 \n\t" 00778 "psrlq $6, %%mm1 \n\t" 00779 "psrlq $6, %%mm4 \n\t" 00780 "pand %%mm6, %%mm1 \n\t" 00781 "pand %%mm6, %%mm4 \n\t" 00782 "psrlq $9, %%mm2 \n\t" 00783 "psrlq $9, %%mm5 \n\t" 00784 "pand %%mm7, %%mm2 \n\t" 00785 "pand %%mm7, %%mm5 \n\t" 00786 "por %%mm1, %%mm0 \n\t" 00787 "por %%mm4, %%mm3 \n\t" 00788 "por %%mm2, %%mm0 \n\t" 00789 "por %%mm5, %%mm3 \n\t" 00790 "psllq $16, %%mm3 \n\t" 00791 "por %%mm3, %%mm0 \n\t" 00792 MOVNTQ" %%mm0, %0 \n\t" 00793 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00794 d += 4; 00795 s += 12; 00796 } 00797 __asm__ volatile(SFENCE:::"memory"); 00798 __asm__ volatile(EMMS:::"memory"); 00799 #endif 00800 while (s < end) { 00801 const int b = *s++; 00802 const int g = *s++; 00803 const int r = *s++; 00804 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 00805 } 00806 } 00807 00808 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) 00809 { 00810 const uint8_t *s = src; 00811 const uint8_t *end; 00812 #if HAVE_MMX 00813 const uint8_t *mm_end; 00814 #endif 00815 uint16_t *d = (uint16_t *)dst; 00816 end = s + src_size; 00817 #if HAVE_MMX 00818 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00819 __asm__ volatile( 00820 "movq %0, %%mm7 \n\t" 00821 "movq %1, %%mm6 \n\t" 00822 ::"m"(red_15mask),"m"(green_15mask)); 00823 mm_end = end - 15; 00824 while (s < mm_end) { 00825 __asm__ volatile( 00826 PREFETCH" 32%1 \n\t" 00827 "movd %1, %%mm0 \n\t" 00828 "movd 3%1, %%mm3 \n\t" 00829 "punpckldq 6%1, %%mm0 \n\t" 00830 "punpckldq 9%1, %%mm3 \n\t" 00831 "movq %%mm0, %%mm1 \n\t" 00832 "movq %%mm0, %%mm2 \n\t" 00833 "movq %%mm3, %%mm4 \n\t" 00834 "movq %%mm3, %%mm5 \n\t" 00835 "psllq $7, %%mm0 \n\t" 00836 "psllq $7, %%mm3 \n\t" 00837 "pand %%mm7, %%mm0 \n\t" 00838 "pand %%mm7, %%mm3 \n\t" 00839 "psrlq $6, %%mm1 \n\t" 00840 "psrlq $6, %%mm4 \n\t" 00841 "pand %%mm6, %%mm1 \n\t" 00842 "pand %%mm6, %%mm4 \n\t" 00843 "psrlq $19, %%mm2 \n\t" 00844 "psrlq $19, %%mm5 \n\t" 00845 "pand %2, %%mm2 \n\t" 00846 "pand %2, %%mm5 \n\t" 00847 "por %%mm1, %%mm0 \n\t" 00848 "por %%mm4, %%mm3 \n\t" 00849 "por %%mm2, %%mm0 \n\t" 00850 "por %%mm5, %%mm3 \n\t" 00851 "psllq $16, %%mm3 \n\t" 00852 "por %%mm3, %%mm0 \n\t" 00853 MOVNTQ" %%mm0, %0 \n\t" 00854 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00855 d += 4; 00856 s += 12; 00857 } 00858 __asm__ volatile(SFENCE:::"memory"); 00859 __asm__ volatile(EMMS:::"memory"); 00860 #endif 00861 while (s < end) { 00862 const int r = *s++; 00863 const int g = *s++; 00864 const int b = *s++; 00865 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 00866 } 00867 } 00868 00869 /* 00870 I use less accurate approximation here by simply left-shifting the input 00871 value and filling the low order bits with zeroes. This method improves PNG 00872 compression but this scheme cannot reproduce white exactly, since it does 00873 not generate an all-ones maximum value; the net effect is to darken the 00874 image slightly. 00875 00876 The better method should be "left bit replication": 00877 00878 4 3 2 1 0 00879 --------- 00880 1 1 0 1 1 00881 00882 7 6 5 4 3 2 1 0 00883 ---------------- 00884 1 1 0 1 1 1 1 0 00885 |=======| |===| 00886 | leftmost bits repeated to fill open bits 00887 | 00888 original bits 00889 */ 00890 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 00891 { 00892 const uint16_t *end; 00893 #if HAVE_MMX 00894 const uint16_t *mm_end; 00895 #endif 00896 uint8_t *d = dst; 00897 const uint16_t *s = (const uint16_t*)src; 00898 end = s + src_size/2; 00899 #if HAVE_MMX 00900 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 00901 mm_end = end - 7; 00902 while (s < mm_end) { 00903 __asm__ volatile( 00904 PREFETCH" 32%1 \n\t" 00905 "movq %1, %%mm0 \n\t" 00906 "movq %1, %%mm1 \n\t" 00907 "movq %1, %%mm2 \n\t" 00908 "pand %2, %%mm0 \n\t" 00909 "pand %3, %%mm1 \n\t" 00910 "pand %4, %%mm2 \n\t" 00911 "psllq $3, %%mm0 \n\t" 00912 "psrlq $2, %%mm1 \n\t" 00913 "psrlq $7, %%mm2 \n\t" 00914 "movq %%mm0, %%mm3 \n\t" 00915 "movq %%mm1, %%mm4 \n\t" 00916 "movq %%mm2, %%mm5 \n\t" 00917 "punpcklwd %5, %%mm0 \n\t" 00918 "punpcklwd %5, %%mm1 \n\t" 00919 "punpcklwd %5, %%mm2 \n\t" 00920 "punpckhwd %5, %%mm3 \n\t" 00921 "punpckhwd %5, %%mm4 \n\t" 00922 "punpckhwd %5, %%mm5 \n\t" 00923 "psllq $8, %%mm1 \n\t" 00924 "psllq $16, %%mm2 \n\t" 00925 "por %%mm1, %%mm0 \n\t" 00926 "por %%mm2, %%mm0 \n\t" 00927 "psllq $8, %%mm4 \n\t" 00928 "psllq $16, %%mm5 \n\t" 00929 "por %%mm4, %%mm3 \n\t" 00930 "por %%mm5, %%mm3 \n\t" 00931 00932 "movq %%mm0, %%mm6 \n\t" 00933 "movq %%mm3, %%mm7 \n\t" 00934 00935 "movq 8%1, %%mm0 \n\t" 00936 "movq 8%1, %%mm1 \n\t" 00937 "movq 8%1, %%mm2 \n\t" 00938 "pand %2, %%mm0 \n\t" 00939 "pand %3, %%mm1 \n\t" 00940 "pand %4, %%mm2 \n\t" 00941 "psllq $3, %%mm0 \n\t" 00942 "psrlq $2, %%mm1 \n\t" 00943 "psrlq $7, %%mm2 \n\t" 00944 "movq %%mm0, %%mm3 \n\t" 00945 "movq %%mm1, %%mm4 \n\t" 00946 "movq %%mm2, %%mm5 \n\t" 00947 "punpcklwd %5, %%mm0 \n\t" 00948 "punpcklwd %5, %%mm1 \n\t" 00949 "punpcklwd %5, %%mm2 \n\t" 00950 "punpckhwd %5, %%mm3 \n\t" 00951 "punpckhwd %5, %%mm4 \n\t" 00952 "punpckhwd %5, %%mm5 \n\t" 00953 "psllq $8, %%mm1 \n\t" 00954 "psllq $16, %%mm2 \n\t" 00955 "por %%mm1, %%mm0 \n\t" 00956 "por %%mm2, %%mm0 \n\t" 00957 "psllq $8, %%mm4 \n\t" 00958 "psllq $16, %%mm5 \n\t" 00959 "por %%mm4, %%mm3 \n\t" 00960 "por %%mm5, %%mm3 \n\t" 00961 00962 :"=m"(*d) 00963 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) 00964 :"memory"); 00965 /* borrowed 32 to 24 */ 00966 __asm__ volatile( 00967 "movq %%mm0, %%mm4 \n\t" 00968 "movq %%mm3, %%mm5 \n\t" 00969 "movq %%mm6, %%mm0 \n\t" 00970 "movq %%mm7, %%mm1 \n\t" 00971 00972 "movq %%mm4, %%mm6 \n\t" 00973 "movq %%mm5, %%mm7 \n\t" 00974 "movq %%mm0, %%mm2 \n\t" 00975 "movq %%mm1, %%mm3 \n\t" 00976 00977 STORE_BGR24_MMX 00978 00979 :"=m"(*d) 00980 :"m"(*s) 00981 :"memory"); 00982 d += 24; 00983 s += 8; 00984 } 00985 __asm__ volatile(SFENCE:::"memory"); 00986 __asm__ volatile(EMMS:::"memory"); 00987 #endif 00988 while (s < end) { 00989 register uint16_t bgr; 00990 bgr = *s++; 00991 *d++ = (bgr&0x1F)<<3; 00992 *d++ = (bgr&0x3E0)>>2; 00993 *d++ = (bgr&0x7C00)>>7; 00994 } 00995 } 00996 00997 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 00998 { 00999 const uint16_t *end; 01000 #if HAVE_MMX 01001 const uint16_t *mm_end; 01002 #endif 01003 uint8_t *d = (uint8_t *)dst; 01004 const uint16_t *s = (const uint16_t *)src; 01005 end = s + src_size/2; 01006 #if HAVE_MMX 01007 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 01008 mm_end = end - 7; 01009 while (s < mm_end) { 01010 __asm__ volatile( 01011 PREFETCH" 32%1 \n\t" 01012 "movq %1, %%mm0 \n\t" 01013 "movq %1, %%mm1 \n\t" 01014 "movq %1, %%mm2 \n\t" 01015 "pand %2, %%mm0 \n\t" 01016 "pand %3, %%mm1 \n\t" 01017 "pand %4, %%mm2 \n\t" 01018 "psllq $3, %%mm0 \n\t" 01019 "psrlq $3, %%mm1 \n\t" 01020 "psrlq $8, %%mm2 \n\t" 01021 "movq %%mm0, %%mm3 \n\t" 01022 "movq %%mm1, %%mm4 \n\t" 01023 "movq %%mm2, %%mm5 \n\t" 01024 "punpcklwd %5, %%mm0 \n\t" 01025 "punpcklwd %5, %%mm1 \n\t" 01026 "punpcklwd %5, %%mm2 \n\t" 01027 "punpckhwd %5, %%mm3 \n\t" 01028 "punpckhwd %5, %%mm4 \n\t" 01029 "punpckhwd %5, %%mm5 \n\t" 01030 "psllq $8, %%mm1 \n\t" 01031 "psllq $16, %%mm2 \n\t" 01032 "por %%mm1, %%mm0 \n\t" 01033 "por %%mm2, %%mm0 \n\t" 01034 "psllq $8, %%mm4 \n\t" 01035 "psllq $16, %%mm5 \n\t" 01036 "por %%mm4, %%mm3 \n\t" 01037 "por %%mm5, %%mm3 \n\t" 01038 01039 "movq %%mm0, %%mm6 \n\t" 01040 "movq %%mm3, %%mm7 \n\t" 01041 01042 "movq 8%1, %%mm0 \n\t" 01043 "movq 8%1, %%mm1 \n\t" 01044 "movq 8%1, %%mm2 \n\t" 01045 "pand %2, %%mm0 \n\t" 01046 "pand %3, %%mm1 \n\t" 01047 "pand %4, %%mm2 \n\t" 01048 "psllq $3, %%mm0 \n\t" 01049 "psrlq $3, %%mm1 \n\t" 01050 "psrlq $8, %%mm2 \n\t" 01051 "movq %%mm0, %%mm3 \n\t" 01052 "movq %%mm1, %%mm4 \n\t" 01053 "movq %%mm2, %%mm5 \n\t" 01054 "punpcklwd %5, %%mm0 \n\t" 01055 "punpcklwd %5, %%mm1 \n\t" 01056 "punpcklwd %5, %%mm2 \n\t" 01057 "punpckhwd %5, %%mm3 \n\t" 01058 "punpckhwd %5, %%mm4 \n\t" 01059 "punpckhwd %5, %%mm5 \n\t" 01060 "psllq $8, %%mm1 \n\t" 01061 "psllq $16, %%mm2 \n\t" 01062 "por %%mm1, %%mm0 \n\t" 01063 "por %%mm2, %%mm0 \n\t" 01064 "psllq $8, %%mm4 \n\t" 01065 "psllq $16, %%mm5 \n\t" 01066 "por %%mm4, %%mm3 \n\t" 01067 "por %%mm5, %%mm3 \n\t" 01068 :"=m"(*d) 01069 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) 01070 :"memory"); 01071 /* borrowed 32 to 24 */ 01072 __asm__ volatile( 01073 "movq %%mm0, %%mm4 \n\t" 01074 "movq %%mm3, %%mm5 \n\t" 01075 "movq %%mm6, %%mm0 \n\t" 01076 "movq %%mm7, %%mm1 \n\t" 01077 01078 "movq %%mm4, %%mm6 \n\t" 01079 "movq %%mm5, %%mm7 \n\t" 01080 "movq %%mm0, %%mm2 \n\t" 01081 "movq %%mm1, %%mm3 \n\t" 01082 01083 STORE_BGR24_MMX 01084 01085 :"=m"(*d) 01086 :"m"(*s) 01087 :"memory"); 01088 d += 24; 01089 s += 8; 01090 } 01091 __asm__ volatile(SFENCE:::"memory"); 01092 __asm__ volatile(EMMS:::"memory"); 01093 #endif 01094 while (s < end) { 01095 register uint16_t bgr; 01096 bgr = *s++; 01097 *d++ = (bgr&0x1F)<<3; 01098 *d++ = (bgr&0x7E0)>>3; 01099 *d++ = (bgr&0xF800)>>8; 01100 } 01101 } 01102 01103 /* 01104 * mm0 = 00 B3 00 B2 00 B1 00 B0 01105 * mm1 = 00 G3 00 G2 00 G1 00 G0 01106 * mm2 = 00 R3 00 R2 00 R1 00 R0 01107 * mm6 = FF FF FF FF FF FF FF FF 01108 * mm7 = 00 00 00 00 00 00 00 00 01109 */ 01110 #define PACK_RGB32 \ 01111 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ 01112 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ 01113 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ 01114 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ 01115 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ 01116 "movq %%mm0, %%mm3 \n\t" \ 01117 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ 01118 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ 01119 MOVNTQ" %%mm0, %0 \n\t" \ 01120 MOVNTQ" %%mm3, 8%0 \n\t" \ 01121 01122 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size) 01123 { 01124 const uint16_t *end; 01125 #if HAVE_MMX 01126 const uint16_t *mm_end; 01127 #endif 01128 uint8_t *d = dst; 01129 const uint16_t *s = (const uint16_t *)src; 01130 end = s + src_size/2; 01131 #if HAVE_MMX 01132 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 01133 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); 01134 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 01135 mm_end = end - 3; 01136 while (s < mm_end) { 01137 __asm__ volatile( 01138 PREFETCH" 32%1 \n\t" 01139 "movq %1, %%mm0 \n\t" 01140 "movq %1, %%mm1 \n\t" 01141 "movq %1, %%mm2 \n\t" 01142 "pand %2, %%mm0 \n\t" 01143 "pand %3, %%mm1 \n\t" 01144 "pand %4, %%mm2 \n\t" 01145 "psllq $3, %%mm0 \n\t" 01146 "psrlq $2, %%mm1 \n\t" 01147 "psrlq $7, %%mm2 \n\t" 01148 PACK_RGB32 01149 :"=m"(*d) 01150 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) 01151 :"memory"); 01152 d += 16; 01153 s += 4; 01154 } 01155 __asm__ volatile(SFENCE:::"memory"); 01156 __asm__ volatile(EMMS:::"memory"); 01157 #endif 01158 while (s < end) { 01159 register uint16_t bgr; 01160 bgr = *s++; 01161 #if HAVE_BIGENDIAN 01162 *d++ = 255; 01163 *d++ = (bgr&0x7C00)>>7; 01164 *d++ = (bgr&0x3E0)>>2; 01165 *d++ = (bgr&0x1F)<<3; 01166 #else 01167 *d++ = (bgr&0x1F)<<3; 01168 *d++ = (bgr&0x3E0)>>2; 01169 *d++ = (bgr&0x7C00)>>7; 01170 *d++ = 255; 01171 #endif 01172 } 01173 } 01174 01175 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size) 01176 { 01177 const uint16_t *end; 01178 #if HAVE_MMX 01179 const uint16_t *mm_end; 01180 #endif 01181 uint8_t *d = dst; 01182 const uint16_t *s = (const uint16_t*)src; 01183 end = s + src_size/2; 01184 #if HAVE_MMX 01185 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 01186 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); 01187 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 01188 mm_end = end - 3; 01189 while (s < mm_end) { 01190 __asm__ volatile( 01191 PREFETCH" 32%1 \n\t" 01192 "movq %1, %%mm0 \n\t" 01193 "movq %1, %%mm1 \n\t" 01194 "movq %1, %%mm2 \n\t" 01195 "pand %2, %%mm0 \n\t" 01196 "pand %3, %%mm1 \n\t" 01197 "pand %4, %%mm2 \n\t" 01198 "psllq $3, %%mm0 \n\t" 01199 "psrlq $3, %%mm1 \n\t" 01200 "psrlq $8, %%mm2 \n\t" 01201 PACK_RGB32 01202 :"=m"(*d) 01203 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) 01204 :"memory"); 01205 d += 16; 01206 s += 4; 01207 } 01208 __asm__ volatile(SFENCE:::"memory"); 01209 __asm__ volatile(EMMS:::"memory"); 01210 #endif 01211 while (s < end) { 01212 register uint16_t bgr; 01213 bgr = *s++; 01214 #if HAVE_BIGENDIAN 01215 *d++ = 255; 01216 *d++ = (bgr&0xF800)>>8; 01217 *d++ = (bgr&0x7E0)>>3; 01218 *d++ = (bgr&0x1F)<<3; 01219 #else 01220 *d++ = (bgr&0x1F)<<3; 01221 *d++ = (bgr&0x7E0)>>3; 01222 *d++ = (bgr&0xF800)>>8; 01223 *d++ = 255; 01224 #endif 01225 } 01226 } 01227 01228 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) 01229 { 01230 x86_reg idx = 15 - src_size; 01231 const uint8_t *s = src-idx; 01232 uint8_t *d = dst-idx; 01233 #if HAVE_MMX 01234 __asm__ volatile( 01235 "test %0, %0 \n\t" 01236 "jns 2f \n\t" 01237 PREFETCH" (%1, %0) \n\t" 01238 "movq %3, %%mm7 \n\t" 01239 "pxor %4, %%mm7 \n\t" 01240 "movq %%mm7, %%mm6 \n\t" 01241 "pxor %5, %%mm7 \n\t" 01242 ASMALIGN(4) 01243 "1: \n\t" 01244 PREFETCH" 32(%1, %0) \n\t" 01245 "movq (%1, %0), %%mm0 \n\t" 01246 "movq 8(%1, %0), %%mm1 \n\t" 01247 # if HAVE_MMX2 01248 "pshufw $177, %%mm0, %%mm3 \n\t" 01249 "pshufw $177, %%mm1, %%mm5 \n\t" 01250 "pand %%mm7, %%mm0 \n\t" 01251 "pand %%mm6, %%mm3 \n\t" 01252 "pand %%mm7, %%mm1 \n\t" 01253 "pand %%mm6, %%mm5 \n\t" 01254 "por %%mm3, %%mm0 \n\t" 01255 "por %%mm5, %%mm1 \n\t" 01256 # else 01257 "movq %%mm0, %%mm2 \n\t" 01258 "movq %%mm1, %%mm4 \n\t" 01259 "pand %%mm7, %%mm0 \n\t" 01260 "pand %%mm6, %%mm2 \n\t" 01261 "pand %%mm7, %%mm1 \n\t" 01262 "pand %%mm6, %%mm4 \n\t" 01263 "movq %%mm2, %%mm3 \n\t" 01264 "movq %%mm4, %%mm5 \n\t" 01265 "pslld $16, %%mm2 \n\t" 01266 "psrld $16, %%mm3 \n\t" 01267 "pslld $16, %%mm4 \n\t" 01268 "psrld $16, %%mm5 \n\t" 01269 "por %%mm2, %%mm0 \n\t" 01270 "por %%mm4, %%mm1 \n\t" 01271 "por %%mm3, %%mm0 \n\t" 01272 "por %%mm5, %%mm1 \n\t" 01273 # endif 01274 MOVNTQ" %%mm0, (%2, %0) \n\t" 01275 MOVNTQ" %%mm1, 8(%2, %0) \n\t" 01276 "add $16, %0 \n\t" 01277 "js 1b \n\t" 01278 SFENCE" \n\t" 01279 EMMS" \n\t" 01280 "2: \n\t" 01281 : "+&r"(idx) 01282 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) 01283 : "memory"); 01284 #endif 01285 for (; idx<15; idx+=4) { 01286 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; 01287 v &= 0xff00ff; 01288 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); 01289 } 01290 } 01291 01292 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 01293 { 01294 unsigned i; 01295 #if HAVE_MMX 01296 x86_reg mmx_size= 23 - src_size; 01297 __asm__ volatile ( 01298 "test %%"REG_a", %%"REG_a" \n\t" 01299 "jns 2f \n\t" 01300 "movq "MANGLE(mask24r)", %%mm5 \n\t" 01301 "movq "MANGLE(mask24g)", %%mm6 \n\t" 01302 "movq "MANGLE(mask24b)", %%mm7 \n\t" 01303 ASMALIGN(4) 01304 "1: \n\t" 01305 PREFETCH" 32(%1, %%"REG_a") \n\t" 01306 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 01307 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG 01308 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B 01309 "psllq $16, %%mm0 \n\t" // 00 BGR BGR 01310 "pand %%mm5, %%mm0 \n\t" 01311 "pand %%mm6, %%mm1 \n\t" 01312 "pand %%mm7, %%mm2 \n\t" 01313 "por %%mm0, %%mm1 \n\t" 01314 "por %%mm2, %%mm1 \n\t" 01315 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 01316 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG 01317 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B 01318 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR 01319 "pand %%mm7, %%mm0 \n\t" 01320 "pand %%mm5, %%mm1 \n\t" 01321 "pand %%mm6, %%mm2 \n\t" 01322 "por %%mm0, %%mm1 \n\t" 01323 "por %%mm2, %%mm1 \n\t" 01324 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B 01325 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R 01326 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR 01327 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG 01328 "pand %%mm6, %%mm0 \n\t" 01329 "pand %%mm7, %%mm1 \n\t" 01330 "pand %%mm5, %%mm2 \n\t" 01331 "por %%mm0, %%mm1 \n\t" 01332 "por %%mm2, %%mm1 \n\t" 01333 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" 01334 "add $24, %%"REG_a" \n\t" 01335 " js 1b \n\t" 01336 "2: \n\t" 01337 : "+a" (mmx_size) 01338 : "r" (src-mmx_size), "r"(dst-mmx_size) 01339 ); 01340 01341 __asm__ volatile(SFENCE:::"memory"); 01342 __asm__ volatile(EMMS:::"memory"); 01343 01344 if (mmx_size==23) return; //finished, was multiple of 8 01345 01346 src+= src_size; 01347 dst+= src_size; 01348 src_size= 23-mmx_size; 01349 src-= src_size; 01350 dst-= src_size; 01351 #endif 01352 for (i=0; i<src_size; i+=3) { 01353 register uint8_t x; 01354 x = src[i + 2]; 01355 dst[i + 1] = src[i + 1]; 01356 dst[i + 2] = src[i + 0]; 01357 dst[i + 0] = x; 01358 } 01359 } 01360 01361 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01362 long width, long height, 01363 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) 01364 { 01365 long y; 01366 const x86_reg chromWidth= width>>1; 01367 for (y=0; y<height; y++) { 01368 #if HAVE_MMX 01369 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) 01370 __asm__ volatile( 01371 "xor %%"REG_a", %%"REG_a" \n\t" 01372 ASMALIGN(4) 01373 "1: \n\t" 01374 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 01375 PREFETCH" 32(%2, %%"REG_a") \n\t" 01376 PREFETCH" 32(%3, %%"REG_a") \n\t" 01377 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 01378 "movq %%mm0, %%mm2 \n\t" // U(0) 01379 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 01380 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 01381 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 01382 01383 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 01384 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 01385 "movq %%mm3, %%mm4 \n\t" // Y(0) 01386 "movq %%mm5, %%mm6 \n\t" // Y(8) 01387 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) 01388 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) 01389 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) 01390 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) 01391 01392 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" 01393 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" 01394 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" 01395 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" 01396 01397 "add $8, %%"REG_a" \n\t" 01398 "cmp %4, %%"REG_a" \n\t" 01399 " jb 1b \n\t" 01400 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 01401 : "%"REG_a 01402 ); 01403 #else 01404 01405 #if ARCH_ALPHA && HAVE_MVI 01406 #define pl2yuy2(n) \ 01407 y1 = yc[n]; \ 01408 y2 = yc2[n]; \ 01409 u = uc[n]; \ 01410 v = vc[n]; \ 01411 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \ 01412 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \ 01413 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \ 01414 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \ 01415 yuv1 = (u << 8) + (v << 24); \ 01416 yuv2 = yuv1 + y2; \ 01417 yuv1 += y1; \ 01418 qdst[n] = yuv1; \ 01419 qdst2[n] = yuv2; 01420 01421 int i; 01422 uint64_t *qdst = (uint64_t *) dst; 01423 uint64_t *qdst2 = (uint64_t *) (dst + dstStride); 01424 const uint32_t *yc = (uint32_t *) ysrc; 01425 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride); 01426 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc; 01427 for (i = 0; i < chromWidth; i += 8) { 01428 uint64_t y1, y2, yuv1, yuv2; 01429 uint64_t u, v; 01430 /* Prefetch */ 01431 __asm__("ldq $31,64(%0)" :: "r"(yc)); 01432 __asm__("ldq $31,64(%0)" :: "r"(yc2)); 01433 __asm__("ldq $31,64(%0)" :: "r"(uc)); 01434 __asm__("ldq $31,64(%0)" :: "r"(vc)); 01435 01436 pl2yuy2(0); 01437 pl2yuy2(1); 01438 pl2yuy2(2); 01439 pl2yuy2(3); 01440 01441 yc += 4; 01442 yc2 += 4; 01443 uc += 4; 01444 vc += 4; 01445 qdst += 4; 01446 qdst2 += 4; 01447 } 01448 y++; 01449 ysrc += lumStride; 01450 dst += dstStride; 01451 01452 #elif HAVE_FAST_64BIT 01453 int i; 01454 uint64_t *ldst = (uint64_t *) dst; 01455 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 01456 for (i = 0; i < chromWidth; i += 2) { 01457 uint64_t k, l; 01458 k = yc[0] + (uc[0] << 8) + 01459 (yc[1] << 16) + (vc[0] << 24); 01460 l = yc[2] + (uc[1] << 8) + 01461 (yc[3] << 16) + (vc[1] << 24); 01462 *ldst++ = k + (l << 32); 01463 yc += 4; 01464 uc += 2; 01465 vc += 2; 01466 } 01467 01468 #else 01469 int i, *idst = (int32_t *) dst; 01470 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 01471 for (i = 0; i < chromWidth; i++) { 01472 #if HAVE_BIGENDIAN 01473 *idst++ = (yc[0] << 24)+ (uc[0] << 16) + 01474 (yc[1] << 8) + (vc[0] << 0); 01475 #else 01476 *idst++ = yc[0] + (uc[0] << 8) + 01477 (yc[1] << 16) + (vc[0] << 24); 01478 #endif 01479 yc += 2; 01480 uc++; 01481 vc++; 01482 } 01483 #endif 01484 #endif 01485 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { 01486 usrc += chromStride; 01487 vsrc += chromStride; 01488 } 01489 ysrc += lumStride; 01490 dst += dstStride; 01491 } 01492 #if HAVE_MMX 01493 __asm__(EMMS" \n\t" 01494 SFENCE" \n\t" 01495 :::"memory"); 01496 #endif 01497 } 01498 01503 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01504 long width, long height, 01505 long lumStride, long chromStride, long dstStride) 01506 { 01507 //FIXME interpolate chroma 01508 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); 01509 } 01510 01511 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01512 long width, long height, 01513 long lumStride, long chromStride, long dstStride, long vertLumPerChroma) 01514 { 01515 long y; 01516 const x86_reg chromWidth= width>>1; 01517 for (y=0; y<height; y++) { 01518 #if HAVE_MMX 01519 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) 01520 __asm__ volatile( 01521 "xor %%"REG_a", %%"REG_a" \n\t" 01522 ASMALIGN(4) 01523 "1: \n\t" 01524 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" 01525 PREFETCH" 32(%2, %%"REG_a") \n\t" 01526 PREFETCH" 32(%3, %%"REG_a") \n\t" 01527 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) 01528 "movq %%mm0, %%mm2 \n\t" // U(0) 01529 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) 01530 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 01531 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 01532 01533 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) 01534 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) 01535 "movq %%mm0, %%mm4 \n\t" // Y(0) 01536 "movq %%mm2, %%mm6 \n\t" // Y(8) 01537 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) 01538 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) 01539 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) 01540 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) 01541 01542 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" 01543 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" 01544 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" 01545 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" 01546 01547 "add $8, %%"REG_a" \n\t" 01548 "cmp %4, %%"REG_a" \n\t" 01549 " jb 1b \n\t" 01550 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 01551 : "%"REG_a 01552 ); 01553 #else 01554 //FIXME adapt the Alpha ASM code from yv12->yuy2 01555 01556 #if HAVE_FAST_64BIT 01557 int i; 01558 uint64_t *ldst = (uint64_t *) dst; 01559 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 01560 for (i = 0; i < chromWidth; i += 2) { 01561 uint64_t k, l; 01562 k = uc[0] + (yc[0] << 8) + 01563 (vc[0] << 16) + (yc[1] << 24); 01564 l = uc[1] + (yc[2] << 8) + 01565 (vc[1] << 16) + (yc[3] << 24); 01566 *ldst++ = k + (l << 32); 01567 yc += 4; 01568 uc += 2; 01569 vc += 2; 01570 } 01571 01572 #else 01573 int i, *idst = (int32_t *) dst; 01574 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc; 01575 for (i = 0; i < chromWidth; i++) { 01576 #if HAVE_BIGENDIAN 01577 *idst++ = (uc[0] << 24)+ (yc[0] << 16) + 01578 (vc[0] << 8) + (yc[1] << 0); 01579 #else 01580 *idst++ = uc[0] + (yc[0] << 8) + 01581 (vc[0] << 16) + (yc[1] << 24); 01582 #endif 01583 yc += 2; 01584 uc++; 01585 vc++; 01586 } 01587 #endif 01588 #endif 01589 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { 01590 usrc += chromStride; 01591 vsrc += chromStride; 01592 } 01593 ysrc += lumStride; 01594 dst += dstStride; 01595 } 01596 #if HAVE_MMX 01597 __asm__(EMMS" \n\t" 01598 SFENCE" \n\t" 01599 :::"memory"); 01600 #endif 01601 } 01602 01607 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01608 long width, long height, 01609 long lumStride, long chromStride, long dstStride) 01610 { 01611 //FIXME interpolate chroma 01612 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); 01613 } 01614 01618 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01619 long width, long height, 01620 long lumStride, long chromStride, long dstStride) 01621 { 01622 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); 01623 } 01624 01628 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 01629 long width, long height, 01630 long lumStride, long chromStride, long dstStride) 01631 { 01632 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); 01633 } 01634 01639 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 01640 long width, long height, 01641 long lumStride, long chromStride, long srcStride) 01642 { 01643 long y; 01644 const x86_reg chromWidth= width>>1; 01645 for (y=0; y<height; y+=2) { 01646 #if HAVE_MMX 01647 __asm__ volatile( 01648 "xor %%"REG_a", %%"REG_a" \n\t" 01649 "pcmpeqw %%mm7, %%mm7 \n\t" 01650 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 01651 ASMALIGN(4) 01652 "1: \n\t" 01653 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 01654 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 01655 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 01656 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) 01657 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) 01658 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) 01659 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) 01660 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 01661 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 01662 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 01663 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 01664 01665 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" 01666 01667 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) 01668 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) 01669 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) 01670 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) 01671 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) 01672 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) 01673 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 01674 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 01675 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 01676 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 01677 01678 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" 01679 01680 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 01681 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 01682 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 01683 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 01684 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 01685 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 01686 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 01687 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 01688 01689 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" 01690 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" 01691 01692 "add $8, %%"REG_a" \n\t" 01693 "cmp %4, %%"REG_a" \n\t" 01694 " jb 1b \n\t" 01695 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 01696 : "memory", "%"REG_a 01697 ); 01698 01699 ydst += lumStride; 01700 src += srcStride; 01701 01702 __asm__ volatile( 01703 "xor %%"REG_a", %%"REG_a" \n\t" 01704 ASMALIGN(4) 01705 "1: \n\t" 01706 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 01707 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 01708 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 01709 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) 01710 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) 01711 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 01712 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 01713 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 01714 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 01715 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 01716 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 01717 01718 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" 01719 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" 01720 01721 "add $8, %%"REG_a" \n\t" 01722 "cmp %4, %%"REG_a" \n\t" 01723 " jb 1b \n\t" 01724 01725 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 01726 : "memory", "%"REG_a 01727 ); 01728 #else 01729 long i; 01730 for (i=0; i<chromWidth; i++) { 01731 ydst[2*i+0] = src[4*i+0]; 01732 udst[i] = src[4*i+1]; 01733 ydst[2*i+1] = src[4*i+2]; 01734 vdst[i] = src[4*i+3]; 01735 } 01736 ydst += lumStride; 01737 src += srcStride; 01738 01739 for (i=0; i<chromWidth; i++) { 01740 ydst[2*i+0] = src[4*i+0]; 01741 ydst[2*i+1] = src[4*i+2]; 01742 } 01743 #endif 01744 udst += chromStride; 01745 vdst += chromStride; 01746 ydst += lumStride; 01747 src += srcStride; 01748 } 01749 #if HAVE_MMX 01750 __asm__ volatile(EMMS" \n\t" 01751 SFENCE" \n\t" 01752 :::"memory"); 01753 #endif 01754 } 01755 01756 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, 01757 uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 01758 long width, long height, long lumStride, long chromStride) 01759 { 01760 /* Y Plane */ 01761 memcpy(ydst, ysrc, width*height); 01762 01763 /* XXX: implement upscaling for U,V */ 01764 } 01765 01766 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) 01767 { 01768 long x,y; 01769 01770 dst[0]= src[0]; 01771 01772 // first line 01773 for (x=0; x<srcWidth-1; x++) { 01774 dst[2*x+1]= (3*src[x] + src[x+1])>>2; 01775 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; 01776 } 01777 dst[2*srcWidth-1]= src[srcWidth-1]; 01778 01779 dst+= dstStride; 01780 01781 for (y=1; y<srcHeight; y++) { 01782 #if HAVE_MMX2 || HAVE_AMD3DNOW 01783 const x86_reg mmxSize= srcWidth&~15; 01784 __asm__ volatile( 01785 "mov %4, %%"REG_a" \n\t" 01786 "1: \n\t" 01787 "movq (%0, %%"REG_a"), %%mm0 \n\t" 01788 "movq (%1, %%"REG_a"), %%mm1 \n\t" 01789 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" 01790 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" 01791 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" 01792 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" 01793 PAVGB" %%mm0, %%mm5 \n\t" 01794 PAVGB" %%mm0, %%mm3 \n\t" 01795 PAVGB" %%mm0, %%mm5 \n\t" 01796 PAVGB" %%mm0, %%mm3 \n\t" 01797 PAVGB" %%mm1, %%mm4 \n\t" 01798 PAVGB" %%mm1, %%mm2 \n\t" 01799 PAVGB" %%mm1, %%mm4 \n\t" 01800 PAVGB" %%mm1, %%mm2 \n\t" 01801 "movq %%mm5, %%mm7 \n\t" 01802 "movq %%mm4, %%mm6 \n\t" 01803 "punpcklbw %%mm3, %%mm5 \n\t" 01804 "punpckhbw %%mm3, %%mm7 \n\t" 01805 "punpcklbw %%mm2, %%mm4 \n\t" 01806 "punpckhbw %%mm2, %%mm6 \n\t" 01807 #if 1 01808 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" 01809 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" 01810 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" 01811 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" 01812 #else 01813 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" 01814 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" 01815 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" 01816 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" 01817 #endif 01818 "add $8, %%"REG_a" \n\t" 01819 " js 1b \n\t" 01820 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), 01821 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), 01822 "g" (-mmxSize) 01823 : "%"REG_a 01824 01825 ); 01826 #else 01827 const x86_reg mmxSize=1; 01828 #endif 01829 dst[0 ]= (3*src[0] + src[srcStride])>>2; 01830 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; 01831 01832 for (x=mmxSize-1; x<srcWidth-1; x++) { 01833 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; 01834 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; 01835 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; 01836 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; 01837 } 01838 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; 01839 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; 01840 01841 dst+=dstStride*2; 01842 src+=srcStride; 01843 } 01844 01845 // last line 01846 #if 1 01847 dst[0]= src[0]; 01848 01849 for (x=0; x<srcWidth-1; x++) { 01850 dst[2*x+1]= (3*src[x] + src[x+1])>>2; 01851 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; 01852 } 01853 dst[2*srcWidth-1]= src[srcWidth-1]; 01854 #else 01855 for (x=0; x<srcWidth; x++) { 01856 dst[2*x+0]= 01857 dst[2*x+1]= src[x]; 01858 } 01859 #endif 01860 01861 #if HAVE_MMX 01862 __asm__ volatile(EMMS" \n\t" 01863 SFENCE" \n\t" 01864 :::"memory"); 01865 #endif 01866 } 01867 01874 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 01875 long width, long height, 01876 long lumStride, long chromStride, long srcStride) 01877 { 01878 long y; 01879 const x86_reg chromWidth= width>>1; 01880 for (y=0; y<height; y+=2) { 01881 #if HAVE_MMX 01882 __asm__ volatile( 01883 "xor %%"REG_a", %%"REG_a" \n\t" 01884 "pcmpeqw %%mm7, %%mm7 \n\t" 01885 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 01886 ASMALIGN(4) 01887 "1: \n\t" 01888 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 01889 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) 01890 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) 01891 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) 01892 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) 01893 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) 01894 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) 01895 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 01896 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 01897 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 01898 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 01899 01900 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" 01901 01902 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) 01903 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) 01904 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) 01905 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) 01906 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) 01907 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) 01908 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 01909 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 01910 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 01911 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 01912 01913 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" 01914 01915 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 01916 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 01917 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 01918 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 01919 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 01920 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 01921 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 01922 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 01923 01924 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" 01925 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" 01926 01927 "add $8, %%"REG_a" \n\t" 01928 "cmp %4, %%"REG_a" \n\t" 01929 " jb 1b \n\t" 01930 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 01931 : "memory", "%"REG_a 01932 ); 01933 01934 ydst += lumStride; 01935 src += srcStride; 01936 01937 __asm__ volatile( 01938 "xor %%"REG_a", %%"REG_a" \n\t" 01939 ASMALIGN(4) 01940 "1: \n\t" 01941 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" 01942 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) 01943 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) 01944 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) 01945 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) 01946 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 01947 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 01948 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 01949 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 01950 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 01951 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 01952 01953 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" 01954 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" 01955 01956 "add $8, %%"REG_a" \n\t" 01957 "cmp %4, %%"REG_a" \n\t" 01958 " jb 1b \n\t" 01959 01960 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 01961 : "memory", "%"REG_a 01962 ); 01963 #else 01964 long i; 01965 for (i=0; i<chromWidth; i++) { 01966 udst[i] = src[4*i+0]; 01967 ydst[2*i+0] = src[4*i+1]; 01968 vdst[i] = src[4*i+2]; 01969 ydst[2*i+1] = src[4*i+3]; 01970 } 01971 ydst += lumStride; 01972 src += srcStride; 01973 01974 for (i=0; i<chromWidth; i++) { 01975 ydst[2*i+0] = src[4*i+1]; 01976 ydst[2*i+1] = src[4*i+3]; 01977 } 01978 #endif 01979 udst += chromStride; 01980 vdst += chromStride; 01981 ydst += lumStride; 01982 src += srcStride; 01983 } 01984 #if HAVE_MMX 01985 __asm__ volatile(EMMS" \n\t" 01986 SFENCE" \n\t" 01987 :::"memory"); 01988 #endif 01989 } 01990 01998 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 01999 long width, long height, 02000 long lumStride, long chromStride, long srcStride) 02001 { 02002 long y; 02003 const x86_reg chromWidth= width>>1; 02004 #if HAVE_MMX 02005 for (y=0; y<height-2; y+=2) { 02006 long i; 02007 for (i=0; i<2; i++) { 02008 __asm__ volatile( 02009 "mov %2, %%"REG_a" \n\t" 02010 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" 02011 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 02012 "pxor %%mm7, %%mm7 \n\t" 02013 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" 02014 ASMALIGN(4) 02015 "1: \n\t" 02016 PREFETCH" 64(%0, %%"REG_d") \n\t" 02017 "movd (%0, %%"REG_d"), %%mm0 \n\t" 02018 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" 02019 "punpcklbw %%mm7, %%mm0 \n\t" 02020 "punpcklbw %%mm7, %%mm1 \n\t" 02021 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" 02022 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" 02023 "punpcklbw %%mm7, %%mm2 \n\t" 02024 "punpcklbw %%mm7, %%mm3 \n\t" 02025 "pmaddwd %%mm6, %%mm0 \n\t" 02026 "pmaddwd %%mm6, %%mm1 \n\t" 02027 "pmaddwd %%mm6, %%mm2 \n\t" 02028 "pmaddwd %%mm6, %%mm3 \n\t" 02029 #ifndef FAST_BGR2YV12 02030 "psrad $8, %%mm0 \n\t" 02031 "psrad $8, %%mm1 \n\t" 02032 "psrad $8, %%mm2 \n\t" 02033 "psrad $8, %%mm3 \n\t" 02034 #endif 02035 "packssdw %%mm1, %%mm0 \n\t" 02036 "packssdw %%mm3, %%mm2 \n\t" 02037 "pmaddwd %%mm5, %%mm0 \n\t" 02038 "pmaddwd %%mm5, %%mm2 \n\t" 02039 "packssdw %%mm2, %%mm0 \n\t" 02040 "psraw $7, %%mm0 \n\t" 02041 02042 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 02043 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" 02044 "punpcklbw %%mm7, %%mm4 \n\t" 02045 "punpcklbw %%mm7, %%mm1 \n\t" 02046 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" 02047 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" 02048 "punpcklbw %%mm7, %%mm2 \n\t" 02049 "punpcklbw %%mm7, %%mm3 \n\t" 02050 "pmaddwd %%mm6, %%mm4 \n\t" 02051 "pmaddwd %%mm6, %%mm1 \n\t" 02052 "pmaddwd %%mm6, %%mm2 \n\t" 02053 "pmaddwd %%mm6, %%mm3 \n\t" 02054 #ifndef FAST_BGR2YV12 02055 "psrad $8, %%mm4 \n\t" 02056 "psrad $8, %%mm1 \n\t" 02057 "psrad $8, %%mm2 \n\t" 02058 "psrad $8, %%mm3 \n\t" 02059 #endif 02060 "packssdw %%mm1, %%mm4 \n\t" 02061 "packssdw %%mm3, %%mm2 \n\t" 02062 "pmaddwd %%mm5, %%mm4 \n\t" 02063 "pmaddwd %%mm5, %%mm2 \n\t" 02064 "add $24, %%"REG_d" \n\t" 02065 "packssdw %%mm2, %%mm4 \n\t" 02066 "psraw $7, %%mm4 \n\t" 02067 02068 "packuswb %%mm4, %%mm0 \n\t" 02069 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" 02070 02071 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" 02072 "add $8, %%"REG_a" \n\t" 02073 " js 1b \n\t" 02074 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) 02075 : "%"REG_a, "%"REG_d 02076 ); 02077 ydst += lumStride; 02078 src += srcStride; 02079 } 02080 src -= srcStride*2; 02081 __asm__ volatile( 02082 "mov %4, %%"REG_a" \n\t" 02083 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 02084 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" 02085 "pxor %%mm7, %%mm7 \n\t" 02086 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" 02087 "add %%"REG_d", %%"REG_d" \n\t" 02088 ASMALIGN(4) 02089 "1: \n\t" 02090 PREFETCH" 64(%0, %%"REG_d") \n\t" 02091 PREFETCH" 64(%1, %%"REG_d") \n\t" 02092 #if HAVE_MMX2 || HAVE_AMD3DNOW 02093 "movq (%0, %%"REG_d"), %%mm0 \n\t" 02094 "movq (%1, %%"REG_d"), %%mm1 \n\t" 02095 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" 02096 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" 02097 PAVGB" %%mm1, %%mm0 \n\t" 02098 PAVGB" %%mm3, %%mm2 \n\t" 02099 "movq %%mm0, %%mm1 \n\t" 02100 "movq %%mm2, %%mm3 \n\t" 02101 "psrlq $24, %%mm0 \n\t" 02102 "psrlq $24, %%mm2 \n\t" 02103 PAVGB" %%mm1, %%mm0 \n\t" 02104 PAVGB" %%mm3, %%mm2 \n\t" 02105 "punpcklbw %%mm7, %%mm0 \n\t" 02106 "punpcklbw %%mm7, %%mm2 \n\t" 02107 #else 02108 "movd (%0, %%"REG_d"), %%mm0 \n\t" 02109 "movd (%1, %%"REG_d"), %%mm1 \n\t" 02110 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" 02111 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" 02112 "punpcklbw %%mm7, %%mm0 \n\t" 02113 "punpcklbw %%mm7, %%mm1 \n\t" 02114 "punpcklbw %%mm7, %%mm2 \n\t" 02115 "punpcklbw %%mm7, %%mm3 \n\t" 02116 "paddw %%mm1, %%mm0 \n\t" 02117 "paddw %%mm3, %%mm2 \n\t" 02118 "paddw %%mm2, %%mm0 \n\t" 02119 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" 02120 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" 02121 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" 02122 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" 02123 "punpcklbw %%mm7, %%mm4 \n\t" 02124 "punpcklbw %%mm7, %%mm1 \n\t" 02125 "punpcklbw %%mm7, %%mm2 \n\t" 02126 "punpcklbw %%mm7, %%mm3 \n\t" 02127 "paddw %%mm1, %%mm4 \n\t" 02128 "paddw %%mm3, %%mm2 \n\t" 02129 "paddw %%mm4, %%mm2 \n\t" 02130 "psrlw $2, %%mm0 \n\t" 02131 "psrlw $2, %%mm2 \n\t" 02132 #endif 02133 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" 02134 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" 02135 02136 "pmaddwd %%mm0, %%mm1 \n\t" 02137 "pmaddwd %%mm2, %%mm3 \n\t" 02138 "pmaddwd %%mm6, %%mm0 \n\t" 02139 "pmaddwd %%mm6, %%mm2 \n\t" 02140 #ifndef FAST_BGR2YV12 02141 "psrad $8, %%mm0 \n\t" 02142 "psrad $8, %%mm1 \n\t" 02143 "psrad $8, %%mm2 \n\t" 02144 "psrad $8, %%mm3 \n\t" 02145 #endif 02146 "packssdw %%mm2, %%mm0 \n\t" 02147 "packssdw %%mm3, %%mm1 \n\t" 02148 "pmaddwd %%mm5, %%mm0 \n\t" 02149 "pmaddwd %%mm5, %%mm1 \n\t" 02150 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 02151 "psraw $7, %%mm0 \n\t" 02152 02153 #if HAVE_MMX2 || HAVE_AMD3DNOW 02154 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" 02155 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" 02156 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" 02157 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" 02158 PAVGB" %%mm1, %%mm4 \n\t" 02159 PAVGB" %%mm3, %%mm2 \n\t" 02160 "movq %%mm4, %%mm1 \n\t" 02161 "movq %%mm2, %%mm3 \n\t" 02162 "psrlq $24, %%mm4 \n\t" 02163 "psrlq $24, %%mm2 \n\t" 02164 PAVGB" %%mm1, %%mm4 \n\t" 02165 PAVGB" %%mm3, %%mm2 \n\t" 02166 "punpcklbw %%mm7, %%mm4 \n\t" 02167 "punpcklbw %%mm7, %%mm2 \n\t" 02168 #else 02169 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" 02170 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" 02171 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" 02172 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" 02173 "punpcklbw %%mm7, %%mm4 \n\t" 02174 "punpcklbw %%mm7, %%mm1 \n\t" 02175 "punpcklbw %%mm7, %%mm2 \n\t" 02176 "punpcklbw %%mm7, %%mm3 \n\t" 02177 "paddw %%mm1, %%mm4 \n\t" 02178 "paddw %%mm3, %%mm2 \n\t" 02179 "paddw %%mm2, %%mm4 \n\t" 02180 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" 02181 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" 02182 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" 02183 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" 02184 "punpcklbw %%mm7, %%mm5 \n\t" 02185 "punpcklbw %%mm7, %%mm1 \n\t" 02186 "punpcklbw %%mm7, %%mm2 \n\t" 02187 "punpcklbw %%mm7, %%mm3 \n\t" 02188 "paddw %%mm1, %%mm5 \n\t" 02189 "paddw %%mm3, %%mm2 \n\t" 02190 "paddw %%mm5, %%mm2 \n\t" 02191 "movq "MANGLE(ff_w1111)", %%mm5 \n\t" 02192 "psrlw $2, %%mm4 \n\t" 02193 "psrlw $2, %%mm2 \n\t" 02194 #endif 02195 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" 02196 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" 02197 02198 "pmaddwd %%mm4, %%mm1 \n\t" 02199 "pmaddwd %%mm2, %%mm3 \n\t" 02200 "pmaddwd %%mm6, %%mm4 \n\t" 02201 "pmaddwd %%mm6, %%mm2 \n\t" 02202 #ifndef FAST_BGR2YV12 02203 "psrad $8, %%mm4 \n\t" 02204 "psrad $8, %%mm1 \n\t" 02205 "psrad $8, %%mm2 \n\t" 02206 "psrad $8, %%mm3 \n\t" 02207 #endif 02208 "packssdw %%mm2, %%mm4 \n\t" 02209 "packssdw %%mm3, %%mm1 \n\t" 02210 "pmaddwd %%mm5, %%mm4 \n\t" 02211 "pmaddwd %%mm5, %%mm1 \n\t" 02212 "add $24, %%"REG_d" \n\t" 02213 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 02214 "psraw $7, %%mm4 \n\t" 02215 02216 "movq %%mm0, %%mm1 \n\t" 02217 "punpckldq %%mm4, %%mm0 \n\t" 02218 "punpckhdq %%mm4, %%mm1 \n\t" 02219 "packsswb %%mm1, %%mm0 \n\t" 02220 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" 02221 "movd %%mm0, (%2, %%"REG_a") \n\t" 02222 "punpckhdq %%mm0, %%mm0 \n\t" 02223 "movd %%mm0, (%3, %%"REG_a") \n\t" 02224 "add $4, %%"REG_a" \n\t" 02225 " js 1b \n\t" 02226 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) 02227 : "%"REG_a, "%"REG_d 02228 ); 02229 02230 udst += chromStride; 02231 vdst += chromStride; 02232 src += srcStride*2; 02233 } 02234 02235 __asm__ volatile(EMMS" \n\t" 02236 SFENCE" \n\t" 02237 :::"memory"); 02238 #else 02239 y=0; 02240 #endif 02241 for (; y<height; y+=2) { 02242 long i; 02243 for (i=0; i<chromWidth; i++) { 02244 unsigned int b = src[6*i+0]; 02245 unsigned int g = src[6*i+1]; 02246 unsigned int r = src[6*i+2]; 02247 02248 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 02249 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; 02250 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; 02251 02252 udst[i] = U; 02253 vdst[i] = V; 02254 ydst[2*i] = Y; 02255 02256 b = src[6*i+3]; 02257 g = src[6*i+4]; 02258 r = src[6*i+5]; 02259 02260 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 02261 ydst[2*i+1] = Y; 02262 } 02263 ydst += lumStride; 02264 src += srcStride; 02265 02266 for (i=0; i<chromWidth; i++) { 02267 unsigned int b = src[6*i+0]; 02268 unsigned int g = src[6*i+1]; 02269 unsigned int r = src[6*i+2]; 02270 02271 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 02272 02273 ydst[2*i] = Y; 02274 02275 b = src[6*i+3]; 02276 g = src[6*i+4]; 02277 r = src[6*i+5]; 02278 02279 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; 02280 ydst[2*i+1] = Y; 02281 } 02282 udst += chromStride; 02283 vdst += chromStride; 02284 ydst += lumStride; 02285 src += srcStride; 02286 } 02287 } 02288 02289 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, 02290 long width, long height, long src1Stride, 02291 long src2Stride, long dstStride) 02292 { 02293 long h; 02294 02295 for (h=0; h < height; h++) { 02296 long w; 02297 02298 #if HAVE_MMX 02299 #if HAVE_SSE2 02300 __asm__( 02301 "xor %%"REG_a", %%"REG_a" \n\t" 02302 "1: \n\t" 02303 PREFETCH" 64(%1, %%"REG_a") \n\t" 02304 PREFETCH" 64(%2, %%"REG_a") \n\t" 02305 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" 02306 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" 02307 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" 02308 "punpcklbw %%xmm2, %%xmm0 \n\t" 02309 "punpckhbw %%xmm2, %%xmm1 \n\t" 02310 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" 02311 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" 02312 "add $16, %%"REG_a" \n\t" 02313 "cmp %3, %%"REG_a" \n\t" 02314 " jb 1b \n\t" 02315 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 02316 : "memory", "%"REG_a"" 02317 ); 02318 #else 02319 __asm__( 02320 "xor %%"REG_a", %%"REG_a" \n\t" 02321 "1: \n\t" 02322 PREFETCH" 64(%1, %%"REG_a") \n\t" 02323 PREFETCH" 64(%2, %%"REG_a") \n\t" 02324 "movq (%1, %%"REG_a"), %%mm0 \n\t" 02325 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" 02326 "movq %%mm0, %%mm1 \n\t" 02327 "movq %%mm2, %%mm3 \n\t" 02328 "movq (%2, %%"REG_a"), %%mm4 \n\t" 02329 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" 02330 "punpcklbw %%mm4, %%mm0 \n\t" 02331 "punpckhbw %%mm4, %%mm1 \n\t" 02332 "punpcklbw %%mm5, %%mm2 \n\t" 02333 "punpckhbw %%mm5, %%mm3 \n\t" 02334 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" 02335 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" 02336 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" 02337 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" 02338 "add $16, %%"REG_a" \n\t" 02339 "cmp %3, %%"REG_a" \n\t" 02340 " jb 1b \n\t" 02341 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 02342 : "memory", "%"REG_a 02343 ); 02344 #endif 02345 for (w= (width&(~15)); w < width; w++) { 02346 dest[2*w+0] = src1[w]; 02347 dest[2*w+1] = src2[w]; 02348 } 02349 #else 02350 for (w=0; w < width; w++) { 02351 dest[2*w+0] = src1[w]; 02352 dest[2*w+1] = src2[w]; 02353 } 02354 #endif 02355 dest += dstStride; 02356 src1 += src1Stride; 02357 src2 += src2Stride; 02358 } 02359 #if HAVE_MMX 02360 __asm__( 02361 EMMS" \n\t" 02362 SFENCE" \n\t" 02363 ::: "memory" 02364 ); 02365 #endif 02366 } 02367 02368 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, 02369 uint8_t *dst1, uint8_t *dst2, 02370 long width, long height, 02371 long srcStride1, long srcStride2, 02372 long dstStride1, long dstStride2) 02373 { 02374 x86_reg y; 02375 long x,w,h; 02376 w=width/2; h=height/2; 02377 #if HAVE_MMX 02378 __asm__ volatile( 02379 PREFETCH" %0 \n\t" 02380 PREFETCH" %1 \n\t" 02381 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); 02382 #endif 02383 for (y=0;y<h;y++) { 02384 const uint8_t* s1=src1+srcStride1*(y>>1); 02385 uint8_t* d=dst1+dstStride1*y; 02386 x=0; 02387 #if HAVE_MMX 02388 for (;x<w-31;x+=32) { 02389 __asm__ volatile( 02390 PREFETCH" 32%1 \n\t" 02391 "movq %1, %%mm0 \n\t" 02392 "movq 8%1, %%mm2 \n\t" 02393 "movq 16%1, %%mm4 \n\t" 02394 "movq 24%1, %%mm6 \n\t" 02395 "movq %%mm0, %%mm1 \n\t" 02396 "movq %%mm2, %%mm3 \n\t" 02397 "movq %%mm4, %%mm5 \n\t" 02398 "movq %%mm6, %%mm7 \n\t" 02399 "punpcklbw %%mm0, %%mm0 \n\t" 02400 "punpckhbw %%mm1, %%mm1 \n\t" 02401 "punpcklbw %%mm2, %%mm2 \n\t" 02402 "punpckhbw %%mm3, %%mm3 \n\t" 02403 "punpcklbw %%mm4, %%mm4 \n\t" 02404 "punpckhbw %%mm5, %%mm5 \n\t" 02405 "punpcklbw %%mm6, %%mm6 \n\t" 02406 "punpckhbw %%mm7, %%mm7 \n\t" 02407 MOVNTQ" %%mm0, %0 \n\t" 02408 MOVNTQ" %%mm1, 8%0 \n\t" 02409 MOVNTQ" %%mm2, 16%0 \n\t" 02410 MOVNTQ" %%mm3, 24%0 \n\t" 02411 MOVNTQ" %%mm4, 32%0 \n\t" 02412 MOVNTQ" %%mm5, 40%0 \n\t" 02413 MOVNTQ" %%mm6, 48%0 \n\t" 02414 MOVNTQ" %%mm7, 56%0" 02415 :"=m"(d[2*x]) 02416 :"m"(s1[x]) 02417 :"memory"); 02418 } 02419 #endif 02420 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; 02421 } 02422 for (y=0;y<h;y++) { 02423 const uint8_t* s2=src2+srcStride2*(y>>1); 02424 uint8_t* d=dst2+dstStride2*y; 02425 x=0; 02426 #if HAVE_MMX 02427 for (;x<w-31;x+=32) { 02428 __asm__ volatile( 02429 PREFETCH" 32%1 \n\t" 02430 "movq %1, %%mm0 \n\t" 02431 "movq 8%1, %%mm2 \n\t" 02432 "movq 16%1, %%mm4 \n\t" 02433 "movq 24%1, %%mm6 \n\t" 02434 "movq %%mm0, %%mm1 \n\t" 02435 "movq %%mm2, %%mm3 \n\t" 02436 "movq %%mm4, %%mm5 \n\t" 02437 "movq %%mm6, %%mm7 \n\t" 02438 "punpcklbw %%mm0, %%mm0 \n\t" 02439 "punpckhbw %%mm1, %%mm1 \n\t" 02440 "punpcklbw %%mm2, %%mm2 \n\t" 02441 "punpckhbw %%mm3, %%mm3 \n\t" 02442 "punpcklbw %%mm4, %%mm4 \n\t" 02443 "punpckhbw %%mm5, %%mm5 \n\t" 02444 "punpcklbw %%mm6, %%mm6 \n\t" 02445 "punpckhbw %%mm7, %%mm7 \n\t" 02446 MOVNTQ" %%mm0, %0 \n\t" 02447 MOVNTQ" %%mm1, 8%0 \n\t" 02448 MOVNTQ" %%mm2, 16%0 \n\t" 02449 MOVNTQ" %%mm3, 24%0 \n\t" 02450 MOVNTQ" %%mm4, 32%0 \n\t" 02451 MOVNTQ" %%mm5, 40%0 \n\t" 02452 MOVNTQ" %%mm6, 48%0 \n\t" 02453 MOVNTQ" %%mm7, 56%0" 02454 :"=m"(d[2*x]) 02455 :"m"(s2[x]) 02456 :"memory"); 02457 } 02458 #endif 02459 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; 02460 } 02461 #if HAVE_MMX 02462 __asm__( 02463 EMMS" \n\t" 02464 SFENCE" \n\t" 02465 ::: "memory" 02466 ); 02467 #endif 02468 } 02469 02470 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, 02471 uint8_t *dst, 02472 long width, long height, 02473 long srcStride1, long srcStride2, 02474 long srcStride3, long dstStride) 02475 { 02476 x86_reg x; 02477 long y,w,h; 02478 w=width/2; h=height; 02479 for (y=0;y<h;y++) { 02480 const uint8_t* yp=src1+srcStride1*y; 02481 const uint8_t* up=src2+srcStride2*(y>>2); 02482 const uint8_t* vp=src3+srcStride3*(y>>2); 02483 uint8_t* d=dst+dstStride*y; 02484 x=0; 02485 #if HAVE_MMX 02486 for (;x<w-7;x+=8) { 02487 __asm__ volatile( 02488 PREFETCH" 32(%1, %0) \n\t" 02489 PREFETCH" 32(%2, %0) \n\t" 02490 PREFETCH" 32(%3, %0) \n\t" 02491 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 02492 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ 02493 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ 02494 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 02495 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ 02496 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ 02497 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ 02498 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ 02499 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ 02500 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ 02501 02502 "movq %%mm1, %%mm6 \n\t" 02503 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ 02504 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ 02505 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ 02506 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" 02507 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" 02508 02509 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ 02510 "movq 8(%1, %0, 4), %%mm0 \n\t" 02511 "movq %%mm0, %%mm3 \n\t" 02512 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ 02513 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ 02514 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" 02515 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" 02516 02517 "movq %%mm4, %%mm6 \n\t" 02518 "movq 16(%1, %0, 4), %%mm0 \n\t" 02519 "movq %%mm0, %%mm3 \n\t" 02520 "punpcklbw %%mm5, %%mm4 \n\t" 02521 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ 02522 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ 02523 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" 02524 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" 02525 02526 "punpckhbw %%mm5, %%mm6 \n\t" 02527 "movq 24(%1, %0, 4), %%mm0 \n\t" 02528 "movq %%mm0, %%mm3 \n\t" 02529 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ 02530 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ 02531 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" 02532 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" 02533 02534 : "+r" (x) 02535 : "r"(yp), "r" (up), "r"(vp), "r"(d) 02536 :"memory"); 02537 } 02538 #endif 02539 for (; x<w; x++) { 02540 const long x2 = x<<2; 02541 d[8*x+0] = yp[x2]; 02542 d[8*x+1] = up[x]; 02543 d[8*x+2] = yp[x2+1]; 02544 d[8*x+3] = vp[x]; 02545 d[8*x+4] = yp[x2+2]; 02546 d[8*x+5] = up[x]; 02547 d[8*x+6] = yp[x2+3]; 02548 d[8*x+7] = vp[x]; 02549 } 02550 } 02551 #if HAVE_MMX 02552 __asm__( 02553 EMMS" \n\t" 02554 SFENCE" \n\t" 02555 ::: "memory" 02556 ); 02557 #endif 02558 } 02559 02560 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) 02561 { 02562 dst += count; 02563 src += 2*count; 02564 count= - count; 02565 02566 #if HAVE_MMX 02567 if(count <= -16) { 02568 count += 15; 02569 __asm__ volatile( 02570 "pcmpeqw %%mm7, %%mm7 \n\t" 02571 "psrlw $8, %%mm7 \n\t" 02572 "1: \n\t" 02573 "movq -30(%1, %0, 2), %%mm0 \n\t" 02574 "movq -22(%1, %0, 2), %%mm1 \n\t" 02575 "movq -14(%1, %0, 2), %%mm2 \n\t" 02576 "movq -6(%1, %0, 2), %%mm3 \n\t" 02577 "pand %%mm7, %%mm0 \n\t" 02578 "pand %%mm7, %%mm1 \n\t" 02579 "pand %%mm7, %%mm2 \n\t" 02580 "pand %%mm7, %%mm3 \n\t" 02581 "packuswb %%mm1, %%mm0 \n\t" 02582 "packuswb %%mm3, %%mm2 \n\t" 02583 MOVNTQ" %%mm0,-15(%2, %0) \n\t" 02584 MOVNTQ" %%mm2,- 7(%2, %0) \n\t" 02585 "add $16, %0 \n\t" 02586 " js 1b \n\t" 02587 : "+r"(count) 02588 : "r"(src), "r"(dst) 02589 ); 02590 count -= 15; 02591 } 02592 #endif 02593 while(count<0) { 02594 dst[count]= src[2*count]; 02595 count++; 02596 } 02597 } 02598 02599 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02600 { 02601 dst0+= count; 02602 dst1+= count; 02603 src += 4*count; 02604 count= - count; 02605 #if HAVE_MMX 02606 if(count <= -8) { 02607 count += 7; 02608 __asm__ volatile( 02609 "pcmpeqw %%mm7, %%mm7 \n\t" 02610 "psrlw $8, %%mm7 \n\t" 02611 "1: \n\t" 02612 "movq -28(%1, %0, 4), %%mm0 \n\t" 02613 "movq -20(%1, %0, 4), %%mm1 \n\t" 02614 "movq -12(%1, %0, 4), %%mm2 \n\t" 02615 "movq -4(%1, %0, 4), %%mm3 \n\t" 02616 "pand %%mm7, %%mm0 \n\t" 02617 "pand %%mm7, %%mm1 \n\t" 02618 "pand %%mm7, %%mm2 \n\t" 02619 "pand %%mm7, %%mm3 \n\t" 02620 "packuswb %%mm1, %%mm0 \n\t" 02621 "packuswb %%mm3, %%mm2 \n\t" 02622 "movq %%mm0, %%mm1 \n\t" 02623 "movq %%mm2, %%mm3 \n\t" 02624 "psrlw $8, %%mm0 \n\t" 02625 "psrlw $8, %%mm2 \n\t" 02626 "pand %%mm7, %%mm1 \n\t" 02627 "pand %%mm7, %%mm3 \n\t" 02628 "packuswb %%mm2, %%mm0 \n\t" 02629 "packuswb %%mm3, %%mm1 \n\t" 02630 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 02631 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 02632 "add $8, %0 \n\t" 02633 " js 1b \n\t" 02634 : "+r"(count) 02635 : "r"(src), "r"(dst0), "r"(dst1) 02636 ); 02637 count -= 7; 02638 } 02639 #endif 02640 while(count<0) { 02641 dst0[count]= src[4*count+0]; 02642 dst1[count]= src[4*count+2]; 02643 count++; 02644 } 02645 } 02646 02647 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02648 { 02649 dst0 += count; 02650 dst1 += count; 02651 src0 += 4*count; 02652 src1 += 4*count; 02653 count= - count; 02654 #ifdef PAVGB 02655 if(count <= -8) { 02656 count += 7; 02657 __asm__ volatile( 02658 "pcmpeqw %%mm7, %%mm7 \n\t" 02659 "psrlw $8, %%mm7 \n\t" 02660 "1: \n\t" 02661 "movq -28(%1, %0, 4), %%mm0 \n\t" 02662 "movq -20(%1, %0, 4), %%mm1 \n\t" 02663 "movq -12(%1, %0, 4), %%mm2 \n\t" 02664 "movq -4(%1, %0, 4), %%mm3 \n\t" 02665 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 02666 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 02667 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 02668 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 02669 "pand %%mm7, %%mm0 \n\t" 02670 "pand %%mm7, %%mm1 \n\t" 02671 "pand %%mm7, %%mm2 \n\t" 02672 "pand %%mm7, %%mm3 \n\t" 02673 "packuswb %%mm1, %%mm0 \n\t" 02674 "packuswb %%mm3, %%mm2 \n\t" 02675 "movq %%mm0, %%mm1 \n\t" 02676 "movq %%mm2, %%mm3 \n\t" 02677 "psrlw $8, %%mm0 \n\t" 02678 "psrlw $8, %%mm2 \n\t" 02679 "pand %%mm7, %%mm1 \n\t" 02680 "pand %%mm7, %%mm3 \n\t" 02681 "packuswb %%mm2, %%mm0 \n\t" 02682 "packuswb %%mm3, %%mm1 \n\t" 02683 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 02684 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 02685 "add $8, %0 \n\t" 02686 " js 1b \n\t" 02687 : "+r"(count) 02688 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) 02689 ); 02690 count -= 7; 02691 } 02692 #endif 02693 while(count<0) { 02694 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; 02695 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; 02696 count++; 02697 } 02698 } 02699 02700 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02701 { 02702 dst0+= count; 02703 dst1+= count; 02704 src += 4*count; 02705 count= - count; 02706 #if HAVE_MMX 02707 if(count <= -8) { 02708 count += 7; 02709 __asm__ volatile( 02710 "pcmpeqw %%mm7, %%mm7 \n\t" 02711 "psrlw $8, %%mm7 \n\t" 02712 "1: \n\t" 02713 "movq -28(%1, %0, 4), %%mm0 \n\t" 02714 "movq -20(%1, %0, 4), %%mm1 \n\t" 02715 "movq -12(%1, %0, 4), %%mm2 \n\t" 02716 "movq -4(%1, %0, 4), %%mm3 \n\t" 02717 "psrlw $8, %%mm0 \n\t" 02718 "psrlw $8, %%mm1 \n\t" 02719 "psrlw $8, %%mm2 \n\t" 02720 "psrlw $8, %%mm3 \n\t" 02721 "packuswb %%mm1, %%mm0 \n\t" 02722 "packuswb %%mm3, %%mm2 \n\t" 02723 "movq %%mm0, %%mm1 \n\t" 02724 "movq %%mm2, %%mm3 \n\t" 02725 "psrlw $8, %%mm0 \n\t" 02726 "psrlw $8, %%mm2 \n\t" 02727 "pand %%mm7, %%mm1 \n\t" 02728 "pand %%mm7, %%mm3 \n\t" 02729 "packuswb %%mm2, %%mm0 \n\t" 02730 "packuswb %%mm3, %%mm1 \n\t" 02731 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 02732 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 02733 "add $8, %0 \n\t" 02734 " js 1b \n\t" 02735 : "+r"(count) 02736 : "r"(src), "r"(dst0), "r"(dst1) 02737 ); 02738 count -= 7; 02739 } 02740 #endif 02741 src++; 02742 while(count<0) { 02743 dst0[count]= src[4*count+0]; 02744 dst1[count]= src[4*count+2]; 02745 count++; 02746 } 02747 } 02748 02749 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02750 { 02751 dst0 += count; 02752 dst1 += count; 02753 src0 += 4*count; 02754 src1 += 4*count; 02755 count= - count; 02756 #ifdef PAVGB 02757 if(count <= -8) { 02758 count += 7; 02759 __asm__ volatile( 02760 "pcmpeqw %%mm7, %%mm7 \n\t" 02761 "psrlw $8, %%mm7 \n\t" 02762 "1: \n\t" 02763 "movq -28(%1, %0, 4), %%mm0 \n\t" 02764 "movq -20(%1, %0, 4), %%mm1 \n\t" 02765 "movq -12(%1, %0, 4), %%mm2 \n\t" 02766 "movq -4(%1, %0, 4), %%mm3 \n\t" 02767 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 02768 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 02769 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 02770 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 02771 "psrlw $8, %%mm0 \n\t" 02772 "psrlw $8, %%mm1 \n\t" 02773 "psrlw $8, %%mm2 \n\t" 02774 "psrlw $8, %%mm3 \n\t" 02775 "packuswb %%mm1, %%mm0 \n\t" 02776 "packuswb %%mm3, %%mm2 \n\t" 02777 "movq %%mm0, %%mm1 \n\t" 02778 "movq %%mm2, %%mm3 \n\t" 02779 "psrlw $8, %%mm0 \n\t" 02780 "psrlw $8, %%mm2 \n\t" 02781 "pand %%mm7, %%mm1 \n\t" 02782 "pand %%mm7, %%mm3 \n\t" 02783 "packuswb %%mm2, %%mm0 \n\t" 02784 "packuswb %%mm3, %%mm1 \n\t" 02785 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 02786 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 02787 "add $8, %0 \n\t" 02788 " js 1b \n\t" 02789 : "+r"(count) 02790 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) 02791 ); 02792 count -= 7; 02793 } 02794 #endif 02795 src0++; 02796 src1++; 02797 while(count<0) { 02798 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; 02799 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; 02800 count++; 02801 } 02802 } 02803 02804 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02805 long width, long height, 02806 long lumStride, long chromStride, long srcStride) 02807 { 02808 long y; 02809 const long chromWidth= -((-width)>>1); 02810 02811 for (y=0; y<height; y++) { 02812 RENAME(extract_even)(src, ydst, width); 02813 if(y&1) { 02814 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth); 02815 udst+= chromStride; 02816 vdst+= chromStride; 02817 } 02818 02819 src += srcStride; 02820 ydst+= lumStride; 02821 } 02822 #if HAVE_MMX 02823 __asm__( 02824 EMMS" \n\t" 02825 SFENCE" \n\t" 02826 ::: "memory" 02827 ); 02828 #endif 02829 } 02830 02831 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02832 long width, long height, 02833 long lumStride, long chromStride, long srcStride) 02834 { 02835 long y; 02836 const long chromWidth= -((-width)>>1); 02837 02838 for (y=0; y<height; y++) { 02839 RENAME(extract_even)(src, ydst, width); 02840 RENAME(extract_odd2)(src, udst, vdst, chromWidth); 02841 02842 src += srcStride; 02843 ydst+= lumStride; 02844 udst+= chromStride; 02845 vdst+= chromStride; 02846 } 02847 #if HAVE_MMX 02848 __asm__( 02849 EMMS" \n\t" 02850 SFENCE" \n\t" 02851 ::: "memory" 02852 ); 02853 #endif 02854 } 02855 02856 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02857 long width, long height, 02858 long lumStride, long chromStride, long srcStride) 02859 { 02860 long y; 02861 const long chromWidth= -((-width)>>1); 02862 02863 for (y=0; y<height; y++) { 02864 RENAME(extract_even)(src+1, ydst, width); 02865 if(y&1) { 02866 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); 02867 udst+= chromStride; 02868 vdst+= chromStride; 02869 } 02870 02871 src += srcStride; 02872 ydst+= lumStride; 02873 } 02874 #if HAVE_MMX 02875 __asm__( 02876 EMMS" \n\t" 02877 SFENCE" \n\t" 02878 ::: "memory" 02879 ); 02880 #endif 02881 } 02882 02883 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02884 long width, long height, 02885 long lumStride, long chromStride, long srcStride) 02886 { 02887 long y; 02888 const long chromWidth= -((-width)>>1); 02889 02890 for (y=0; y<height; y++) { 02891 RENAME(extract_even)(src+1, ydst, width); 02892 RENAME(extract_even2)(src, udst, vdst, chromWidth); 02893 02894 src += srcStride; 02895 ydst+= lumStride; 02896 udst+= chromStride; 02897 vdst+= chromStride; 02898 } 02899 #if HAVE_MMX 02900 __asm__( 02901 EMMS" \n\t" 02902 SFENCE" \n\t" 02903 ::: "memory" 02904 ); 02905 #endif 02906 } 02907 02908 static inline void RENAME(rgb2rgb_init)(void) 02909 { 02910 rgb15to16 = RENAME(rgb15to16); 02911 rgb15tobgr24 = RENAME(rgb15tobgr24); 02912 rgb15to32 = RENAME(rgb15to32); 02913 rgb16tobgr24 = RENAME(rgb16tobgr24); 02914 rgb16to32 = RENAME(rgb16to32); 02915 rgb16to15 = RENAME(rgb16to15); 02916 rgb24tobgr16 = RENAME(rgb24tobgr16); 02917 rgb24tobgr15 = RENAME(rgb24tobgr15); 02918 rgb24tobgr32 = RENAME(rgb24tobgr32); 02919 rgb32to16 = RENAME(rgb32to16); 02920 rgb32to15 = RENAME(rgb32to15); 02921 rgb32tobgr24 = RENAME(rgb32tobgr24); 02922 rgb24to15 = RENAME(rgb24to15); 02923 rgb24to16 = RENAME(rgb24to16); 02924 rgb24tobgr24 = RENAME(rgb24tobgr24); 02925 rgb32tobgr32 = RENAME(rgb32tobgr32); 02926 rgb32tobgr16 = RENAME(rgb32tobgr16); 02927 rgb32tobgr15 = RENAME(rgb32tobgr15); 02928 yv12toyuy2 = RENAME(yv12toyuy2); 02929 yv12touyvy = RENAME(yv12touyvy); 02930 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); 02931 yuv422ptouyvy = RENAME(yuv422ptouyvy); 02932 yuy2toyv12 = RENAME(yuy2toyv12); 02933 // yvu9toyv12 = RENAME(yvu9toyv12); 02934 planar2x = RENAME(planar2x); 02935 rgb24toyv12 = RENAME(rgb24toyv12); 02936 interleaveBytes = RENAME(interleaveBytes); 02937 vu9_to_vu12 = RENAME(vu9_to_vu12); 02938 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); 02939 02940 uyvytoyuv420 = RENAME(uyvytoyuv420); 02941 uyvytoyuv422 = RENAME(uyvytoyuv422); 02942 yuyvtoyuv420 = RENAME(yuyvtoyuv420); 02943 yuyvtoyuv422 = RENAME(yuyvtoyuv422); 02944 }