/* Libav */
00001 /* 00002 * The simplest mpeg encoder (well, it was the simplest!) 00003 * Copyright (c) 2000,2001 Fabrice Bellard 00004 * 00005 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> 00006 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> 00007 * 00008 * This file is part of FFmpeg. 00009 * 00010 * FFmpeg is free software; you can redistribute it and/or 00011 * modify it under the terms of the GNU Lesser General Public 00012 * License as published by the Free Software Foundation; either 00013 * version 2.1 of the License, or (at your option) any later version. 00014 * 00015 * FFmpeg is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 * Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with FFmpeg; if not, write to the Free Software 00022 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00023 */ 00024 00025 #include "libavutil/x86_cpu.h" 00026 #include "libavcodec/avcodec.h" 00027 #include "libavcodec/dsputil.h" 00028 #include "libavcodec/mpegvideo.h" 00029 #include "dsputil_mmx.h" 00030 00031 extern uint16_t inv_zigzag_direct16[64]; 00032 00033 00034 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 00035 DCTELEM *block, int n, int qscale) 00036 { 00037 x86_reg level, qmul, qadd, nCoeffs; 00038 00039 qmul = qscale << 1; 00040 00041 assert(s->block_last_index[n]>=0 || s->h263_aic); 00042 00043 if (!s->h263_aic) { 00044 if (n < 4) 00045 level = block[0] * s->y_dc_scale; 00046 else 00047 level = block[0] * s->c_dc_scale; 00048 qadd = (qscale - 1) | 1; 00049 }else{ 00050 qadd = 0; 00051 level= block[0]; 00052 } 00053 if(s->ac_pred) 00054 nCoeffs=63; 00055 else 00056 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 
00057 //printf("%d %d ", qmul, qadd); 00058 __asm__ volatile( 00059 "movd %1, %%mm6 \n\t" //qmul 00060 "packssdw %%mm6, %%mm6 \n\t" 00061 "packssdw %%mm6, %%mm6 \n\t" 00062 "movd %2, %%mm5 \n\t" //qadd 00063 "pxor %%mm7, %%mm7 \n\t" 00064 "packssdw %%mm5, %%mm5 \n\t" 00065 "packssdw %%mm5, %%mm5 \n\t" 00066 "psubw %%mm5, %%mm7 \n\t" 00067 "pxor %%mm4, %%mm4 \n\t" 00068 ASMALIGN(4) 00069 "1: \n\t" 00070 "movq (%0, %3), %%mm0 \n\t" 00071 "movq 8(%0, %3), %%mm1 \n\t" 00072 00073 "pmullw %%mm6, %%mm0 \n\t" 00074 "pmullw %%mm6, %%mm1 \n\t" 00075 00076 "movq (%0, %3), %%mm2 \n\t" 00077 "movq 8(%0, %3), %%mm3 \n\t" 00078 00079 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00080 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00081 00082 "pxor %%mm2, %%mm0 \n\t" 00083 "pxor %%mm3, %%mm1 \n\t" 00084 00085 "paddw %%mm7, %%mm0 \n\t" 00086 "paddw %%mm7, %%mm1 \n\t" 00087 00088 "pxor %%mm0, %%mm2 \n\t" 00089 "pxor %%mm1, %%mm3 \n\t" 00090 00091 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 00092 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 00093 00094 "pandn %%mm2, %%mm0 \n\t" 00095 "pandn %%mm3, %%mm1 \n\t" 00096 00097 "movq %%mm0, (%0, %3) \n\t" 00098 "movq %%mm1, 8(%0, %3) \n\t" 00099 00100 "add $16, %3 \n\t" 00101 "jng 1b \n\t" 00102 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 00103 : "memory" 00104 ); 00105 block[0]= level; 00106 } 00107 00108 00109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 00110 DCTELEM *block, int n, int qscale) 00111 { 00112 x86_reg qmul, qadd, nCoeffs; 00113 00114 qmul = qscale << 1; 00115 qadd = (qscale - 1) | 1; 00116 00117 assert(s->block_last_index[n]>=0 || s->h263_aic); 00118 00119 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 00120 //printf("%d %d ", qmul, qadd); 00121 __asm__ volatile( 00122 "movd %1, %%mm6 \n\t" //qmul 00123 "packssdw %%mm6, %%mm6 \n\t" 00124 "packssdw %%mm6, %%mm6 \n\t" 00125 "movd %2, %%mm5 \n\t" //qadd 00126 "pxor %%mm7, %%mm7 \n\t" 00127 "packssdw %%mm5, %%mm5 \n\t" 00128 "packssdw %%mm5, %%mm5 \n\t" 00129 "psubw %%mm5, %%mm7 \n\t" 00130 "pxor %%mm4, %%mm4 \n\t" 00131 ASMALIGN(4) 00132 "1: \n\t" 00133 "movq (%0, %3), %%mm0 \n\t" 00134 "movq 8(%0, %3), %%mm1 \n\t" 00135 00136 "pmullw %%mm6, %%mm0 \n\t" 00137 "pmullw %%mm6, %%mm1 \n\t" 00138 00139 "movq (%0, %3), %%mm2 \n\t" 00140 "movq 8(%0, %3), %%mm3 \n\t" 00141 00142 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00143 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00144 00145 "pxor %%mm2, %%mm0 \n\t" 00146 "pxor %%mm3, %%mm1 \n\t" 00147 00148 "paddw %%mm7, %%mm0 \n\t" 00149 "paddw %%mm7, %%mm1 \n\t" 00150 00151 "pxor %%mm0, %%mm2 \n\t" 00152 "pxor %%mm1, %%mm3 \n\t" 00153 00154 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 00155 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 00156 00157 "pandn %%mm2, %%mm0 \n\t" 00158 "pandn %%mm3, %%mm1 \n\t" 00159 00160 "movq %%mm0, (%0, %3) \n\t" 00161 "movq %%mm1, 8(%0, %3) \n\t" 00162 00163 "add $16, %3 \n\t" 00164 "jng 1b \n\t" 00165 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 00166 : "memory" 00167 ); 00168 } 00169 00170 00171 /* 00172 NK: 00173 Note: looking at PARANOID: 00174 "enable all paranoid tests for rounding, overflows, etc..." 00175 00176 #ifdef PARANOID 00177 if (level < -2048 || level > 2047) 00178 fprintf(stderr, "unquant error %d %d\n", i, level); 00179 #endif 00180 We can suppose that result of two multiplications can't be greater than 0xFFFF 00181 i.e. is 16-bit, so we use here only PMULLW instruction and can avoid 00182 a complex multiplication. 00183 ===================================================== 00184 Full formula for multiplication of 2 integer numbers 00185 which are represent as high:low words: 00186 input: value1 = high1:low1 00187 value2 = high2:low2 00188 output: value3 = value1*value2 00189 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) 00190 this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 00191 but this algorithm will compute only 0x66cb0ce4 00192 this limited by 16-bit size of operands 00193 --------------------------------- 00194 tlow1 = high1*low2 00195 tlow2 = high2*low1 00196 tlow1 = tlow1 + tlow2 00197 high3:low3 = low1*low2 00198 high3 += tlow1 00199 */ 00200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 00201 DCTELEM *block, int n, int qscale) 00202 { 00203 x86_reg nCoeffs; 00204 const uint16_t *quant_matrix; 00205 int block0; 00206 00207 assert(s->block_last_index[n]>=0); 00208 00209 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 00210 00211 if (n < 4) 00212 block0 = block[0] * s->y_dc_scale; 00213 else 00214 block0 = block[0] * s->c_dc_scale; 00215 /* XXX: only mpeg1 */ 00216 quant_matrix = s->intra_matrix; 00217 __asm__ volatile( 00218 "pcmpeqw %%mm7, 
%%mm7 \n\t" 00219 "psrlw $15, %%mm7 \n\t" 00220 "movd %2, %%mm6 \n\t" 00221 "packssdw %%mm6, %%mm6 \n\t" 00222 "packssdw %%mm6, %%mm6 \n\t" 00223 "mov %3, %%"REG_a" \n\t" 00224 ASMALIGN(4) 00225 "1: \n\t" 00226 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00227 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00228 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00229 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00230 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00231 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00232 "pxor %%mm2, %%mm2 \n\t" 00233 "pxor %%mm3, %%mm3 \n\t" 00234 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00235 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00236 "pxor %%mm2, %%mm0 \n\t" 00237 "pxor %%mm3, %%mm1 \n\t" 00238 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00239 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00240 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 00241 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 00242 "pxor %%mm4, %%mm4 \n\t" 00243 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00244 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00245 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 00246 "psraw $3, %%mm0 \n\t" 00247 "psraw $3, %%mm1 \n\t" 00248 "psubw %%mm7, %%mm0 \n\t" 00249 "psubw %%mm7, %%mm1 \n\t" 00250 "por %%mm7, %%mm0 \n\t" 00251 "por %%mm7, %%mm1 \n\t" 00252 "pxor %%mm2, %%mm0 \n\t" 00253 "pxor %%mm3, %%mm1 \n\t" 00254 "psubw %%mm2, %%mm0 \n\t" 00255 "psubw %%mm3, %%mm1 \n\t" 00256 "pandn %%mm0, %%mm4 \n\t" 00257 "pandn %%mm1, %%mm5 \n\t" 00258 "movq %%mm4, (%0, %%"REG_a") \n\t" 00259 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00260 00261 "add $16, %%"REG_a" \n\t" 00262 "js 1b \n\t" 00263 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00264 : "%"REG_a, "memory" 00265 ); 00266 block[0]= block0; 00267 } 00268 00269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 00270 DCTELEM *block, int n, int qscale) 00271 { 00272 x86_reg nCoeffs; 00273 const uint16_t *quant_matrix; 00274 00275 assert(s->block_last_index[n]>=0); 00276 00277 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 00278 00279 quant_matrix = s->inter_matrix; 00280 __asm__ volatile( 00281 "pcmpeqw %%mm7, %%mm7 \n\t" 00282 "psrlw $15, %%mm7 \n\t" 00283 "movd %2, %%mm6 \n\t" 00284 "packssdw %%mm6, %%mm6 \n\t" 00285 "packssdw %%mm6, %%mm6 \n\t" 00286 "mov %3, %%"REG_a" \n\t" 00287 ASMALIGN(4) 00288 "1: \n\t" 00289 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00290 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00291 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00292 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00293 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00294 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00295 "pxor %%mm2, %%mm2 \n\t" 00296 "pxor %%mm3, %%mm3 \n\t" 00297 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00298 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 
-1 : 0 00299 "pxor %%mm2, %%mm0 \n\t" 00300 "pxor %%mm3, %%mm1 \n\t" 00301 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00302 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00303 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 00304 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 00305 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 00306 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 00307 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 00308 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 00309 "pxor %%mm4, %%mm4 \n\t" 00310 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00311 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00312 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 00313 "psraw $4, %%mm0 \n\t" 00314 "psraw $4, %%mm1 \n\t" 00315 "psubw %%mm7, %%mm0 \n\t" 00316 "psubw %%mm7, %%mm1 \n\t" 00317 "por %%mm7, %%mm0 \n\t" 00318 "por %%mm7, %%mm1 \n\t" 00319 "pxor %%mm2, %%mm0 \n\t" 00320 "pxor %%mm3, %%mm1 \n\t" 00321 "psubw %%mm2, %%mm0 \n\t" 00322 "psubw %%mm3, %%mm1 \n\t" 00323 "pandn %%mm0, %%mm4 \n\t" 00324 "pandn %%mm1, %%mm5 \n\t" 00325 "movq %%mm4, (%0, %%"REG_a") \n\t" 00326 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00327 00328 "add $16, %%"REG_a" \n\t" 00329 "js 1b \n\t" 00330 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00331 : "%"REG_a, "memory" 00332 ); 00333 } 00334 00335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 00336 DCTELEM *block, int n, int qscale) 00337 { 00338 x86_reg nCoeffs; 00339 const uint16_t *quant_matrix; 00340 int block0; 00341 00342 assert(s->block_last_index[n]>=0); 00343 00344 if(s->alternate_scan) nCoeffs= 63; //FIXME 00345 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 00346 00347 if (n < 4) 00348 block0 = block[0] * s->y_dc_scale; 00349 else 00350 block0 = block[0] * s->c_dc_scale; 00351 quant_matrix = s->intra_matrix; 00352 __asm__ volatile( 00353 "pcmpeqw %%mm7, %%mm7 \n\t" 00354 "psrlw $15, %%mm7 \n\t" 00355 "movd %2, 
%%mm6 \n\t" 00356 "packssdw %%mm6, %%mm6 \n\t" 00357 "packssdw %%mm6, %%mm6 \n\t" 00358 "mov %3, %%"REG_a" \n\t" 00359 ASMALIGN(4) 00360 "1: \n\t" 00361 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00362 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00363 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00364 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00365 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00366 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00367 "pxor %%mm2, %%mm2 \n\t" 00368 "pxor %%mm3, %%mm3 \n\t" 00369 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00370 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00371 "pxor %%mm2, %%mm0 \n\t" 00372 "pxor %%mm3, %%mm1 \n\t" 00373 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00374 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00375 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 00376 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 00377 "pxor %%mm4, %%mm4 \n\t" 00378 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00379 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00380 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 00381 "psraw $3, %%mm0 \n\t" 00382 "psraw $3, %%mm1 \n\t" 00383 "pxor %%mm2, %%mm0 \n\t" 00384 "pxor %%mm3, %%mm1 \n\t" 00385 "psubw %%mm2, %%mm0 \n\t" 00386 "psubw %%mm3, %%mm1 \n\t" 00387 "pandn %%mm0, %%mm4 \n\t" 00388 "pandn %%mm1, %%mm5 \n\t" 00389 "movq %%mm4, (%0, %%"REG_a") \n\t" 00390 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00391 00392 "add $16, %%"REG_a" \n\t" 00393 "jng 1b \n\t" 00394 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00395 : "%"REG_a, "memory" 00396 ); 00397 block[0]= block0; 00398 //Note, we do not do mismatch control for intra as errors cannot accumulate 00399 } 00400 00401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 00402 DCTELEM *block, int n, int qscale) 00403 { 00404 x86_reg nCoeffs; 00405 const uint16_t *quant_matrix; 00406 00407 assert(s->block_last_index[n]>=0); 00408 00409 if(s->alternate_scan) nCoeffs= 63; //FIXME 00410 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 00411 00412 quant_matrix = s->inter_matrix; 00413 __asm__ volatile( 00414 "pcmpeqw %%mm7, %%mm7 \n\t" 00415 "psrlq $48, %%mm7 \n\t" 00416 "movd %2, %%mm6 \n\t" 00417 "packssdw %%mm6, %%mm6 \n\t" 00418 "packssdw %%mm6, %%mm6 \n\t" 00419 "mov %3, %%"REG_a" \n\t" 00420 ASMALIGN(4) 00421 "1: \n\t" 00422 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00423 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00424 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00425 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00426 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00427 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00428 "pxor %%mm2, %%mm2 \n\t" 00429 "pxor %%mm3, %%mm3 \n\t" 00430 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00431 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 
-1 : 0 00432 "pxor %%mm2, %%mm0 \n\t" 00433 "pxor %%mm3, %%mm1 \n\t" 00434 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00435 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00436 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 00437 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 00438 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 00439 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 00440 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 00441 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 00442 "pxor %%mm4, %%mm4 \n\t" 00443 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00444 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00445 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 00446 "psrlw $4, %%mm0 \n\t" 00447 "psrlw $4, %%mm1 \n\t" 00448 "pxor %%mm2, %%mm0 \n\t" 00449 "pxor %%mm3, %%mm1 \n\t" 00450 "psubw %%mm2, %%mm0 \n\t" 00451 "psubw %%mm3, %%mm1 \n\t" 00452 "pandn %%mm0, %%mm4 \n\t" 00453 "pandn %%mm1, %%mm5 \n\t" 00454 "pxor %%mm4, %%mm7 \n\t" 00455 "pxor %%mm5, %%mm7 \n\t" 00456 "movq %%mm4, (%0, %%"REG_a") \n\t" 00457 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00458 00459 "add $16, %%"REG_a" \n\t" 00460 "jng 1b \n\t" 00461 "movd 124(%0, %3), %%mm0 \n\t" 00462 "movq %%mm7, %%mm6 \n\t" 00463 "psrlq $32, %%mm7 \n\t" 00464 "pxor %%mm6, %%mm7 \n\t" 00465 "movq %%mm7, %%mm6 \n\t" 00466 "psrlq $16, %%mm7 \n\t" 00467 "pxor %%mm6, %%mm7 \n\t" 00468 "pslld $31, %%mm7 \n\t" 00469 "psrlq $15, %%mm7 \n\t" 00470 "pxor %%mm7, %%mm0 \n\t" 00471 "movd %%mm0, 124(%0, %3) \n\t" 00472 00473 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) 00474 : "%"REG_a, "memory" 00475 ); 00476 } 00477 00478 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ 00479 const int intra= s->mb_intra; 00480 int *sum= s->dct_error_sum[intra]; 00481 uint16_t *offset= s->dct_offset[intra]; 00482 00483 s->dct_count[intra]++; 00484 00485 __asm__ volatile( 00486 "pxor %%mm7, %%mm7 \n\t" 00487 "1: \n\t" 00488 "pxor %%mm0, %%mm0 \n\t" 00489 
"pxor %%mm1, %%mm1 \n\t" 00490 "movq (%0), %%mm2 \n\t" 00491 "movq 8(%0), %%mm3 \n\t" 00492 "pcmpgtw %%mm2, %%mm0 \n\t" 00493 "pcmpgtw %%mm3, %%mm1 \n\t" 00494 "pxor %%mm0, %%mm2 \n\t" 00495 "pxor %%mm1, %%mm3 \n\t" 00496 "psubw %%mm0, %%mm2 \n\t" 00497 "psubw %%mm1, %%mm3 \n\t" 00498 "movq %%mm2, %%mm4 \n\t" 00499 "movq %%mm3, %%mm5 \n\t" 00500 "psubusw (%2), %%mm2 \n\t" 00501 "psubusw 8(%2), %%mm3 \n\t" 00502 "pxor %%mm0, %%mm2 \n\t" 00503 "pxor %%mm1, %%mm3 \n\t" 00504 "psubw %%mm0, %%mm2 \n\t" 00505 "psubw %%mm1, %%mm3 \n\t" 00506 "movq %%mm2, (%0) \n\t" 00507 "movq %%mm3, 8(%0) \n\t" 00508 "movq %%mm4, %%mm2 \n\t" 00509 "movq %%mm5, %%mm3 \n\t" 00510 "punpcklwd %%mm7, %%mm4 \n\t" 00511 "punpckhwd %%mm7, %%mm2 \n\t" 00512 "punpcklwd %%mm7, %%mm5 \n\t" 00513 "punpckhwd %%mm7, %%mm3 \n\t" 00514 "paddd (%1), %%mm4 \n\t" 00515 "paddd 8(%1), %%mm2 \n\t" 00516 "paddd 16(%1), %%mm5 \n\t" 00517 "paddd 24(%1), %%mm3 \n\t" 00518 "movq %%mm4, (%1) \n\t" 00519 "movq %%mm2, 8(%1) \n\t" 00520 "movq %%mm5, 16(%1) \n\t" 00521 "movq %%mm3, 24(%1) \n\t" 00522 "add $16, %0 \n\t" 00523 "add $32, %1 \n\t" 00524 "add $16, %2 \n\t" 00525 "cmp %3, %0 \n\t" 00526 " jb 1b \n\t" 00527 : "+r" (block), "+r" (sum), "+r" (offset) 00528 : "r"(block+64) 00529 ); 00530 } 00531 00532 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ 00533 const int intra= s->mb_intra; 00534 int *sum= s->dct_error_sum[intra]; 00535 uint16_t *offset= s->dct_offset[intra]; 00536 00537 s->dct_count[intra]++; 00538 00539 __asm__ volatile( 00540 "pxor %%xmm7, %%xmm7 \n\t" 00541 "1: \n\t" 00542 "pxor %%xmm0, %%xmm0 \n\t" 00543 "pxor %%xmm1, %%xmm1 \n\t" 00544 "movdqa (%0), %%xmm2 \n\t" 00545 "movdqa 16(%0), %%xmm3 \n\t" 00546 "pcmpgtw %%xmm2, %%xmm0 \n\t" 00547 "pcmpgtw %%xmm3, %%xmm1 \n\t" 00548 "pxor %%xmm0, %%xmm2 \n\t" 00549 "pxor %%xmm1, %%xmm3 \n\t" 00550 "psubw %%xmm0, %%xmm2 \n\t" 00551 "psubw %%xmm1, %%xmm3 \n\t" 00552 "movdqa %%xmm2, %%xmm4 \n\t" 00553 "movdqa %%xmm3, %%xmm5 \n\t" 00554 
"psubusw (%2), %%xmm2 \n\t" 00555 "psubusw 16(%2), %%xmm3 \n\t" 00556 "pxor %%xmm0, %%xmm2 \n\t" 00557 "pxor %%xmm1, %%xmm3 \n\t" 00558 "psubw %%xmm0, %%xmm2 \n\t" 00559 "psubw %%xmm1, %%xmm3 \n\t" 00560 "movdqa %%xmm2, (%0) \n\t" 00561 "movdqa %%xmm3, 16(%0) \n\t" 00562 "movdqa %%xmm4, %%xmm6 \n\t" 00563 "movdqa %%xmm5, %%xmm0 \n\t" 00564 "punpcklwd %%xmm7, %%xmm4 \n\t" 00565 "punpckhwd %%xmm7, %%xmm6 \n\t" 00566 "punpcklwd %%xmm7, %%xmm5 \n\t" 00567 "punpckhwd %%xmm7, %%xmm0 \n\t" 00568 "paddd (%1), %%xmm4 \n\t" 00569 "paddd 16(%1), %%xmm6 \n\t" 00570 "paddd 32(%1), %%xmm5 \n\t" 00571 "paddd 48(%1), %%xmm0 \n\t" 00572 "movdqa %%xmm4, (%1) \n\t" 00573 "movdqa %%xmm6, 16(%1) \n\t" 00574 "movdqa %%xmm5, 32(%1) \n\t" 00575 "movdqa %%xmm0, 48(%1) \n\t" 00576 "add $32, %0 \n\t" 00577 "add $64, %1 \n\t" 00578 "add $32, %2 \n\t" 00579 "cmp %3, %0 \n\t" 00580 " jb 1b \n\t" 00581 : "+r" (block), "+r" (sum), "+r" (offset) 00582 : "r"(block+64) 00583 ); 00584 } 00585 00586 #if HAVE_SSSE3 00587 #define HAVE_SSSE3_BAK 00588 #endif 00589 #undef HAVE_SSSE3 00590 #define HAVE_SSSE3 0 00591 00592 #undef HAVE_SSE2 00593 #undef HAVE_MMX2 00594 #define HAVE_SSE2 0 00595 #define HAVE_MMX2 0 00596 #define RENAME(a) a ## _MMX 00597 #define RENAMEl(a) a ## _mmx 00598 #include "mpegvideo_mmx_template.c" 00599 00600 #undef HAVE_MMX2 00601 #define HAVE_MMX2 1 00602 #undef RENAME 00603 #undef RENAMEl 00604 #define RENAME(a) a ## _MMX2 00605 #define RENAMEl(a) a ## _mmx2 00606 #include "mpegvideo_mmx_template.c" 00607 00608 #undef HAVE_SSE2 00609 #define HAVE_SSE2 1 00610 #undef RENAME 00611 #undef RENAMEl 00612 #define RENAME(a) a ## _SSE2 00613 #define RENAMEl(a) a ## _sse2 00614 #include "mpegvideo_mmx_template.c" 00615 00616 #ifdef HAVE_SSSE3_BAK 00617 #undef HAVE_SSSE3 00618 #define HAVE_SSSE3 1 00619 #undef RENAME 00620 #undef RENAMEl 00621 #define RENAME(a) a ## _SSSE3 00622 #define RENAMEl(a) a ## _sse2 00623 #include "mpegvideo_mmx_template.c" 00624 #endif 00625 00626 void 
MPV_common_init_mmx(MpegEncContext *s) 00627 { 00628 if (mm_flags & FF_MM_MMX) { 00629 const int dct_algo = s->avctx->dct_algo; 00630 00631 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; 00632 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; 00633 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; 00634 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; 00635 if(!(s->flags & CODEC_FLAG_BITEXACT)) 00636 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; 00637 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 00638 00639 if (mm_flags & FF_MM_SSE2) { 00640 s->denoise_dct= denoise_dct_sse2; 00641 } else { 00642 s->denoise_dct= denoise_dct_mmx; 00643 } 00644 00645 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 00646 #if HAVE_SSSE3 00647 if(mm_flags & FF_MM_SSSE3){ 00648 s->dct_quantize= dct_quantize_SSSE3; 00649 } else 00650 #endif 00651 if(mm_flags & FF_MM_SSE2){ 00652 s->dct_quantize= dct_quantize_SSE2; 00653 } else if(mm_flags & FF_MM_MMX2){ 00654 s->dct_quantize= dct_quantize_MMX2; 00655 } else { 00656 s->dct_quantize= dct_quantize_MMX; 00657 } 00658 } 00659 } 00660 }