• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/mpegvideo_mmx.c

Go to the documentation of this file.
00001 /*
00002  * The simplest mpeg encoder (well, it was the simplest!)
00003  * Copyright (c) 2000,2001 Fabrice Bellard
00004  *
00005  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
00006  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
00007  *
00008  * This file is part of FFmpeg.
00009  *
00010  * FFmpeg is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Lesser General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2.1 of the License, or (at your option) any later version.
00014  *
00015  * FFmpeg is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with FFmpeg; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00023  */
00024 
00025 #include "libavutil/x86_cpu.h"
00026 #include "libavcodec/avcodec.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "libavcodec/mpegvideo.h"
00029 #include "dsputil_mmx.h"
00030 
/* 64-entry 16-bit table defined in another translation unit. It is not
 * referenced by the functions visible in this file; presumably it is consumed
 * by the included mpegvideo_mmx_template.c -- TODO confirm. */
00031 extern uint16_t inv_zigzag_direct16[64];
00032 
00033 
/**
 * MMX dequantizer for H.263-style intra blocks, operating in place.
 *
 * Each nonzero coefficient c becomes c*qmul + qadd when c > 0 and
 * c*qmul - qadd when c < 0, with qmul = 2*qscale; zero coefficients stay
 * zero. All arithmetic is done on 16-bit lanes (pmullw keeps the low 16 bits).
 * The DC coefficient is computed in C before the loop and written back to
 * block[0] afterwards, overriding whatever the loop stored there.
 *
 * @param s      codec context; reads h263_aic, ac_pred, y_dc_scale,
 *               c_dc_scale, block_last_index[] and inter_scantable
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
00034 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00035                                   DCTELEM *block, int n, int qscale)
00036 {
00037     x86_reg level, qmul, qadd, nCoeffs;
00038 
00039     qmul = qscale << 1;
00040 
00041     assert(s->block_last_index[n]>=0 || s->h263_aic);
00042 
      /* DC handling: scaled by the luma/chroma DC scale unless h263_aic is
       * set, in which case DC is passed through and no rounding offset is used. */
00043     if (!s->h263_aic) {
00044         if (n < 4)
00045             level = block[0] * s->y_dc_scale;
00046         else
00047             level = block[0] * s->c_dc_scale;
00048         qadd = (qscale - 1) | 1;
00049     }else{
00050         qadd = 0;
00051         level= block[0];
00052     }
      /* Index of the last coefficient to process: the whole block with
       * ac_pred, otherwise looked up from the last nonzero scan position. */
00053     if(s->ac_pred)
00054         nCoeffs=63;
00055     else
00056         nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00057 //printf("%d %d  ", qmul, qadd);
      /* Operands: %0 = block+nCoeffs, %1 = qmul, %2 = qadd,
       * %3 = byte offset starting at -2*nCoeffs, so (%0,%3) starts at block[0].
       * NOTE(review): %3 is declared as an input operand but is incremented by
       * the loop ("add $16, %3") -- confirm this is acceptable to the compiler. */
00058 __asm__ volatile(
00059                 "movd %1, %%mm6                 \n\t" //qmul
00060                 "packssdw %%mm6, %%mm6          \n\t"
00061                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qmul in all four words
00062                 "movd %2, %%mm5                 \n\t" //qadd
00063                 "pxor %%mm7, %%mm7              \n\t"
00064                 "packssdw %%mm5, %%mm5          \n\t"
00065                 "packssdw %%mm5, %%mm5          \n\t" // mm5 = qadd in all four words
00066                 "psubw %%mm5, %%mm7             \n\t" // mm7 = -qadd
00067                 "pxor %%mm4, %%mm4              \n\t" // mm4 = 0
00068                 ASMALIGN(4)
00069                 "1:                             \n\t"
00070                 "movq (%0, %3), %%mm0           \n\t" // 8 coefficients per iteration
00071                 "movq 8(%0, %3), %%mm1          \n\t"
00072 
00073                 "pmullw %%mm6, %%mm0            \n\t" // block[i]*qmul
00074                 "pmullw %%mm6, %%mm1            \n\t" // block[i]*qmul
00075 
00076                 "movq (%0, %3), %%mm2           \n\t"
00077                 "movq 8(%0, %3), %%mm3          \n\t"
00078 
00079                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] > 0 ? -1 : 0 (AT&T order: dst > src, mm4 is 0)
00080                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] > 0 ? -1 : 0
00081 
00082                 "pxor %%mm2, %%mm0              \n\t" // bitwise-invert the product where block[i] > 0
00083                 "pxor %%mm3, %%mm1              \n\t"
00084 
00085                 "paddw %%mm7, %%mm0             \n\t" // subtract qadd
00086                 "paddw %%mm7, %%mm1             \n\t"
00087 
00088                 "pxor %%mm0, %%mm2              \n\t" // undo the inversion: product + qadd (positive) or product - qadd (otherwise)
00089                 "pxor %%mm1, %%mm3              \n\t"
00090 
00091                 "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
00092                 "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
00093 
00094                 "pandn %%mm2, %%mm0             \n\t" // force zero inputs back to zero
00095                 "pandn %%mm3, %%mm1             \n\t"
00096 
00097                 "movq %%mm0, (%0, %3)           \n\t"
00098                 "movq %%mm1, 8(%0, %3)          \n\t"
00099 
00100                 "add $16, %3                    \n\t"
00101                 "jng 1b                         \n\t" // repeat while the byte offset is <= 0
00102                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00103                 : "memory"
00104         );
          /* The loop also rewrote block[0]; restore the separately computed DC. */
00105         block[0]= level;
00106 }
00107 
00108 
/**
 * MMX dequantizer for H.263-style inter blocks, operating in place.
 *
 * Each nonzero coefficient c becomes c*qmul + qadd when c > 0 and
 * c*qmul - qadd when c < 0, with qmul = 2*qscale and qadd = (qscale-1)|1;
 * zero coefficients stay zero. Unlike the intra variant, block[0] gets no
 * special treatment. Arithmetic is done on 16-bit lanes.
 *
 * @param s      codec context; reads block_last_index[], h263_aic (assert
 *               only) and inter_scantable
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
00109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00110                                   DCTELEM *block, int n, int qscale)
00111 {
00112     x86_reg qmul, qadd, nCoeffs;
00113 
00114     qmul = qscale << 1;
00115     qadd = (qscale - 1) | 1;
00116 
00117     assert(s->block_last_index[n]>=0 || s->h263_aic);
00118 
      /* Index of the last coefficient to process. */
00119     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00120 //printf("%d %d  ", qmul, qadd);
      /* Operands: %0 = block+nCoeffs, %1 = qmul, %2 = qadd,
       * %3 = byte offset starting at -2*nCoeffs, so (%0,%3) starts at block[0].
       * NOTE(review): %3 is declared as an input operand but is incremented by
       * the loop ("add $16, %3") -- confirm this is acceptable to the compiler. */
00121 __asm__ volatile(
00122                 "movd %1, %%mm6                 \n\t" //qmul
00123                 "packssdw %%mm6, %%mm6          \n\t"
00124                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qmul in all four words
00125                 "movd %2, %%mm5                 \n\t" //qadd
00126                 "pxor %%mm7, %%mm7              \n\t"
00127                 "packssdw %%mm5, %%mm5          \n\t"
00128                 "packssdw %%mm5, %%mm5          \n\t" // mm5 = qadd in all four words
00129                 "psubw %%mm5, %%mm7             \n\t" // mm7 = -qadd
00130                 "pxor %%mm4, %%mm4              \n\t" // mm4 = 0
00131                 ASMALIGN(4)
00132                 "1:                             \n\t"
00133                 "movq (%0, %3), %%mm0           \n\t" // 8 coefficients per iteration
00134                 "movq 8(%0, %3), %%mm1          \n\t"
00135 
00136                 "pmullw %%mm6, %%mm0            \n\t" // block[i]*qmul
00137                 "pmullw %%mm6, %%mm1            \n\t" // block[i]*qmul
00138 
00139                 "movq (%0, %3), %%mm2           \n\t"
00140                 "movq 8(%0, %3), %%mm3          \n\t"
00141 
00142                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] > 0 ? -1 : 0 (AT&T order: dst > src, mm4 is 0)
00143                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] > 0 ? -1 : 0
00144 
00145                 "pxor %%mm2, %%mm0              \n\t" // bitwise-invert the product where block[i] > 0
00146                 "pxor %%mm3, %%mm1              \n\t"
00147 
00148                 "paddw %%mm7, %%mm0             \n\t" // subtract qadd
00149                 "paddw %%mm7, %%mm1             \n\t"
00150 
00151                 "pxor %%mm0, %%mm2              \n\t" // undo the inversion: product + qadd (positive) or product - qadd (otherwise)
00152                 "pxor %%mm1, %%mm3              \n\t"
00153 
00154                 "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
00155                 "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
00156 
00157                 "pandn %%mm2, %%mm0             \n\t" // force zero inputs back to zero
00158                 "pandn %%mm3, %%mm1             \n\t"
00159 
00160                 "movq %%mm0, (%0, %3)           \n\t"
00161                 "movq %%mm1, 8(%0, %3)          \n\t"
00162 
00163                 "add $16, %3                    \n\t"
00164                 "jng 1b                         \n\t" // repeat while the byte offset is <= 0
00165                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00166                 : "memory"
00167         );
00168 }
00169 
00170 
00171 /*
00172   NK:
00173   Note: looking at PARANOID:
00174   "enable all paranoid tests for rounding, overflows, etc..."
00175 
00176 #ifdef PARANOID
00177                 if (level < -2048 || level > 2047)
00178                     fprintf(stderr, "unquant error %d %d\n", i, level);
00179 #endif
00180   We can suppose that the result of the two multiplications can't be greater
00181   than 0xFFFF, i.e. it fits in 16 bits, so we use only the PMULLW instruction
00182   here and can avoid a complex multiplication.
00183 =====================================================
00184  Full formula for the multiplication of 2 integer numbers
00185  which are represented as high:low words:
00186  input: value1 = high1:low1
00187         value2 = high2:low2
00188  output: value3 = value1*value2
00189  value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
00190  this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
00191  but this algorithm will compute only 0x66cb0ce4
00192  this is limited by the 16-bit size of the operands
00193  ---------------------------------
00194  tlow1 = high1*low2
00195  tlow2 = high2*low1
00196  tlow1 = tlow1 + tlow2
00197  high3:low3 = low1*low2
00198  high3 += tlow1
00199 */
/**
 * MMX dequantizer for MPEG-1 intra blocks, operating in place.
 *
 * For every nonzero coefficient c the loop computes
 *   v = (abs(c) * qscale * quant_matrix[i]) >> 3;  v = (v - 1) | 1;
 * and stores v with the sign of c; zero coefficients stay zero. Arithmetic is
 * done on 16-bit lanes. The DC coefficient is computed in C and written to
 * block[0] after the loop.
 *
 * @param s      codec context; reads block_last_index[], intra_scantable,
 *               y_dc_scale, c_dc_scale and intra_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
00200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00201                                      DCTELEM *block, int n, int qscale)
00202 {
00203     x86_reg nCoeffs;
00204     const uint16_t *quant_matrix;
00205     int block0;
00206 
00207     assert(s->block_last_index[n]>=0);
00208 
      /* One past the last coefficient to process. */
00209     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00210 
00211     if (n < 4)
00212         block0 = block[0] * s->y_dc_scale;
00213     else
00214         block0 = block[0] * s->c_dc_scale;
00215     /* XXX: only mpeg1 */
00216     quant_matrix = s->intra_matrix;
      /* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
       * %3 = -2*nCoeffs (initial byte offset, copied into REG_a). */
00217 __asm__ volatile(
00218                 "pcmpeqw %%mm7, %%mm7           \n\t"
00219                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00220                 "movd %2, %%mm6                 \n\t"
00221                 "packssdw %%mm6, %%mm6          \n\t"
00222                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qscale in all four words
00223                 "mov %3, %%"REG_a"              \n\t"
00224                 ASMALIGN(4)
00225                 "1:                             \n\t"
00226                 "movq (%0, %%"REG_a"), %%mm0    \n\t" // 8 coefficients per iteration
00227                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00228                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00229                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00230                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00231                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00232                 "pxor %%mm2, %%mm2              \n\t"
00233                 "pxor %%mm3, %%mm3              \n\t"
00234                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00235                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00236                 "pxor %%mm2, %%mm0              \n\t"
00237                 "pxor %%mm3, %%mm1              \n\t"
00238                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00239                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00240                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
00241                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
00242                 "pxor %%mm4, %%mm4              \n\t"
00243                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00244                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00245                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00246                 "psraw $3, %%mm0                \n\t"
00247                 "psraw $3, %%mm1                \n\t"
00248                 "psubw %%mm7, %%mm0             \n\t" // v - 1
00249                 "psubw %%mm7, %%mm1             \n\t"
00250                 "por %%mm7, %%mm0               \n\t" // (v - 1) | 1
00251                 "por %%mm7, %%mm1               \n\t"
00252                 "pxor %%mm2, %%mm0              \n\t" // restore the original sign (xor with mask, then subtract mask)
00253                 "pxor %%mm3, %%mm1              \n\t"
00254                 "psubw %%mm2, %%mm0             \n\t"
00255                 "psubw %%mm3, %%mm1             \n\t"
00256                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00257                 "pandn %%mm1, %%mm5             \n\t"
00258                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00259                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00260 
00261                 "add $16, %%"REG_a"             \n\t"
00262                 "js 1b                          \n\t" // repeat while the byte offset is negative
00263                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00264                 : "%"REG_a, "memory"
00265         );
      /* The loop also rewrote block[0]; store the separately scaled DC. */
00266     block[0]= block0;
00267 }
00268 
/**
 * MMX dequantizer for MPEG-1 inter blocks, operating in place.
 *
 * For every nonzero coefficient c the loop computes
 *   v = ((2*abs(c) + 1) * qscale * quant_matrix[i]) >> 4;  v = (v - 1) | 1;
 * and stores v with the sign of c; zero coefficients stay zero. Arithmetic is
 * done on 16-bit lanes.
 *
 * @param s      codec context; reads block_last_index[], intra_scantable
 *               (for the coefficient count) and inter_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
00269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00270                                      DCTELEM *block, int n, int qscale)
00271 {
00272     x86_reg nCoeffs;
00273     const uint16_t *quant_matrix;
00274 
00275     assert(s->block_last_index[n]>=0);
00276 
      /* One past the last coefficient to process. */
00277     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00278 
00279         quant_matrix = s->inter_matrix;
      /* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
       * %3 = -2*nCoeffs (initial byte offset, copied into REG_a). */
00280 __asm__ volatile(
00281                 "pcmpeqw %%mm7, %%mm7           \n\t"
00282                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00283                 "movd %2, %%mm6                 \n\t"
00284                 "packssdw %%mm6, %%mm6          \n\t"
00285                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qscale in all four words
00286                 "mov %3, %%"REG_a"              \n\t"
00287                 ASMALIGN(4)
00288                 "1:                             \n\t"
00289                 "movq (%0, %%"REG_a"), %%mm0    \n\t" // 8 coefficients per iteration
00290                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00291                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00292                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00293                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00294                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00295                 "pxor %%mm2, %%mm2              \n\t"
00296                 "pxor %%mm3, %%mm3              \n\t"
00297                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00298                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00299                 "pxor %%mm2, %%mm0              \n\t"
00300                 "pxor %%mm3, %%mm1              \n\t"
00301                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00302                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00303                 "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
00304                 "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
00305                 "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
00306                 "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
00307                 "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
00308                 "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
00309                 "pxor %%mm4, %%mm4              \n\t"
00310                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00311                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00312                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00313                 "psraw $4, %%mm0                \n\t"
00314                 "psraw $4, %%mm1                \n\t"
00315                 "psubw %%mm7, %%mm0             \n\t" // v - 1
00316                 "psubw %%mm7, %%mm1             \n\t"
00317                 "por %%mm7, %%mm0               \n\t" // (v - 1) | 1
00318                 "por %%mm7, %%mm1               \n\t"
00319                 "pxor %%mm2, %%mm0              \n\t" // restore the original sign (xor with mask, then subtract mask)
00320                 "pxor %%mm3, %%mm1              \n\t"
00321                 "psubw %%mm2, %%mm0             \n\t"
00322                 "psubw %%mm3, %%mm1             \n\t"
00323                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00324                 "pandn %%mm1, %%mm5             \n\t"
00325                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00326                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00327 
00328                 "add $16, %%"REG_a"             \n\t"
00329                 "js 1b                          \n\t" // repeat while the byte offset is negative
00330                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00331                 : "%"REG_a, "memory"
00332         );
00333 }
00334 
/**
 * MMX dequantizer for MPEG-2 intra blocks, operating in place.
 *
 * For every nonzero coefficient c the loop computes
 *   v = (abs(c) * qscale * quant_matrix[i]) >> 3
 * and stores v with the sign of c; zero coefficients stay zero. Arithmetic is
 * done on 16-bit lanes. The DC coefficient is computed in C and written to
 * block[0] after the loop.
 *
 * @param s      codec context; reads block_last_index[], alternate_scan,
 *               intra_scantable, y_dc_scale, c_dc_scale and intra_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
00335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00336                                      DCTELEM *block, int n, int qscale)
00337 {
00338     x86_reg nCoeffs;
00339     const uint16_t *quant_matrix;
00340     int block0;
00341 
00342     assert(s->block_last_index[n]>=0);
00343 
      /* Index of the last coefficient to process; the whole block when
       * alternate_scan is set. */
00344     if(s->alternate_scan) nCoeffs= 63; //FIXME
00345     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00346 
00347     if (n < 4)
00348         block0 = block[0] * s->y_dc_scale;
00349     else
00350         block0 = block[0] * s->c_dc_scale;
00351     quant_matrix = s->intra_matrix;
      /* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
       * %3 = -2*nCoeffs (initial byte offset, copied into REG_a). */
00352 __asm__ volatile(
00353                 "pcmpeqw %%mm7, %%mm7           \n\t"
00354                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00355                 "movd %2, %%mm6                 \n\t"
00356                 "packssdw %%mm6, %%mm6          \n\t"
00357                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qscale in all four words
00358                 "mov %3, %%"REG_a"              \n\t"
00359                 ASMALIGN(4)
00360                 "1:                             \n\t"
00361                 "movq (%0, %%"REG_a"), %%mm0    \n\t" // 8 coefficients per iteration
00362                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00363                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00364                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00365                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00366                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00367                 "pxor %%mm2, %%mm2              \n\t"
00368                 "pxor %%mm3, %%mm3              \n\t"
00369                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00370                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00371                 "pxor %%mm2, %%mm0              \n\t"
00372                 "pxor %%mm3, %%mm1              \n\t"
00373                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00374                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00375                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
00376                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
00377                 "pxor %%mm4, %%mm4              \n\t"
00378                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00379                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00380                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00381                 "psraw $3, %%mm0                \n\t"
00382                 "psraw $3, %%mm1                \n\t"
00383                 "pxor %%mm2, %%mm0              \n\t" // restore the original sign (xor with mask, then subtract mask)
00384                 "pxor %%mm3, %%mm1              \n\t"
00385                 "psubw %%mm2, %%mm0             \n\t"
00386                 "psubw %%mm3, %%mm1             \n\t"
00387                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00388                 "pandn %%mm1, %%mm5             \n\t"
00389                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00390                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00391 
00392                 "add $16, %%"REG_a"             \n\t"
00393                 "jng 1b                         \n\t" // repeat while the byte offset is <= 0
00394                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00395                 : "%"REG_a, "memory"
00396         );
      /* The loop also rewrote block[0]; store the separately scaled DC. */
00397     block[0]= block0;
00398         //Note, we do not do mismatch control for intra as errors cannot accumulate
00399 }
00400 
/**
 * MMX dequantizer for MPEG-2 inter blocks, operating in place.
 *
 * For every nonzero coefficient c the loop computes
 *   v = ((2*abs(c) + 1) * qscale * quant_matrix[i]) >> 4
 * (logical shift on 16-bit lanes) and stores v with the sign of c; zero
 * coefficients stay zero. All stored words are additionally XOR-accumulated
 * in mm7, and after the loop the accumulated parity bit is used to
 * conditionally flip the least-significant bit of block[63]. This looks like
 * the mismatch control that the intra variant's note says it skips --
 * TODO confirm against the C reference implementation.
 *
 * @param s      codec context; reads block_last_index[], alternate_scan,
 *               intra_scantable (for the coefficient count) and inter_matrix
 * @param block  coefficient block, modified in place
 * @param n      block index, used to look up block_last_index[n]
 * @param qscale quantizer scale
 */
00401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00402                                      DCTELEM *block, int n, int qscale)
00403 {
00404     x86_reg nCoeffs;
00405     const uint16_t *quant_matrix;
00406 
00407     assert(s->block_last_index[n]>=0);
00408 
      /* Index of the last coefficient to process; the whole block when
       * alternate_scan is set. */
00409     if(s->alternate_scan) nCoeffs= 63; //FIXME
00410     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00411 
00412         quant_matrix = s->inter_matrix;
      /* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
       * %3 = -2*nCoeffs. %3 is copied into REG_a for the loop and reused
       * unmodified afterwards, so (%0,%3) still addresses block[0]. */
00413 __asm__ volatile(
00414                 "pcmpeqw %%mm7, %%mm7           \n\t"
00415                 "psrlq $48, %%mm7               \n\t" // mm7 = 0xFFFF in the lowest word only (parity accumulator seed)
00416                 "movd %2, %%mm6                 \n\t"
00417                 "packssdw %%mm6, %%mm6          \n\t"
00418                 "packssdw %%mm6, %%mm6          \n\t" // mm6 = qscale in all four words
00419                 "mov %3, %%"REG_a"              \n\t"
00420                 ASMALIGN(4)
00421                 "1:                             \n\t"
00422                 "movq (%0, %%"REG_a"), %%mm0    \n\t" // 8 coefficients per iteration
00423                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00424                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00425                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00426                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00427                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00428                 "pxor %%mm2, %%mm2              \n\t"
00429                 "pxor %%mm3, %%mm3              \n\t"
00430                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00431                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00432                 "pxor %%mm2, %%mm0              \n\t"
00433                 "pxor %%mm3, %%mm1              \n\t"
00434                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00435                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00436                 "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
00437                 "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
00438                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
00439                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
00440                 "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
00441                 "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
00442                 "pxor %%mm4, %%mm4              \n\t"
00443                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00444                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00445                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00446                 "psrlw $4, %%mm0                \n\t"
00447                 "psrlw $4, %%mm1                \n\t"
00448                 "pxor %%mm2, %%mm0              \n\t" // restore the original sign (xor with mask, then subtract mask)
00449                 "pxor %%mm3, %%mm1              \n\t"
00450                 "psubw %%mm2, %%mm0             \n\t"
00451                 "psubw %%mm3, %%mm1             \n\t"
00452                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00453                 "pandn %%mm1, %%mm5             \n\t"
00454                 "pxor %%mm4, %%mm7              \n\t" // fold both output quadwords into the parity accumulator
00455                 "pxor %%mm5, %%mm7              \n\t"
00456                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00457                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00458 
00459                 "add $16, %%"REG_a"             \n\t"
00460                 "jng 1b                         \n\t" // repeat while the byte offset is <= 0
00461                 "movd 124(%0, %3), %%mm0        \n\t" // load block[62] and block[63] (byte offset 124)
00462                 "movq %%mm7, %%mm6              \n\t"
00463                 "psrlq $32, %%mm7               \n\t"
00464                 "pxor %%mm6, %%mm7              \n\t" // xor upper dword into lower dword
00465                 "movq %%mm7, %%mm6              \n\t"
00466                 "psrlq $16, %%mm7               \n\t"
00467                 "pxor %%mm6, %%mm7              \n\t" // xor of all four words now in the lowest word
00468                 "pslld $31, %%mm7               \n\t" // isolate its bit 0 ...
00469                 "psrlq $15, %%mm7               \n\t" // ... and move it to bit 16, i.e. the LSB of block[63]
00470                 "pxor %%mm7, %%mm0              \n\t"
00471                 "movd %%mm0, 124(%0, %3)        \n\t"
00472 
00473                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
00474                 : "%"REG_a, "memory"
00475         );
00476 }
00477 
00478 static void  denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00479     const int intra= s->mb_intra;
00480     int *sum= s->dct_error_sum[intra];
00481     uint16_t *offset= s->dct_offset[intra];
00482 
00483     s->dct_count[intra]++;
00484 
00485     __asm__ volatile(
00486         "pxor %%mm7, %%mm7                      \n\t"
00487         "1:                                     \n\t"
00488         "pxor %%mm0, %%mm0                      \n\t"
00489         "pxor %%mm1, %%mm1                      \n\t"
00490         "movq (%0), %%mm2                       \n\t"
00491         "movq 8(%0), %%mm3                      \n\t"
00492         "pcmpgtw %%mm2, %%mm0                   \n\t"
00493         "pcmpgtw %%mm3, %%mm1                   \n\t"
00494         "pxor %%mm0, %%mm2                      \n\t"
00495         "pxor %%mm1, %%mm3                      \n\t"
00496         "psubw %%mm0, %%mm2                     \n\t"
00497         "psubw %%mm1, %%mm3                     \n\t"
00498         "movq %%mm2, %%mm4                      \n\t"
00499         "movq %%mm3, %%mm5                      \n\t"
00500         "psubusw (%2), %%mm2                    \n\t"
00501         "psubusw 8(%2), %%mm3                   \n\t"
00502         "pxor %%mm0, %%mm2                      \n\t"
00503         "pxor %%mm1, %%mm3                      \n\t"
00504         "psubw %%mm0, %%mm2                     \n\t"
00505         "psubw %%mm1, %%mm3                     \n\t"
00506         "movq %%mm2, (%0)                       \n\t"
00507         "movq %%mm3, 8(%0)                      \n\t"
00508         "movq %%mm4, %%mm2                      \n\t"
00509         "movq %%mm5, %%mm3                      \n\t"
00510         "punpcklwd %%mm7, %%mm4                 \n\t"
00511         "punpckhwd %%mm7, %%mm2                 \n\t"
00512         "punpcklwd %%mm7, %%mm5                 \n\t"
00513         "punpckhwd %%mm7, %%mm3                 \n\t"
00514         "paddd (%1), %%mm4                      \n\t"
00515         "paddd 8(%1), %%mm2                     \n\t"
00516         "paddd 16(%1), %%mm5                    \n\t"
00517         "paddd 24(%1), %%mm3                    \n\t"
00518         "movq %%mm4, (%1)                       \n\t"
00519         "movq %%mm2, 8(%1)                      \n\t"
00520         "movq %%mm5, 16(%1)                     \n\t"
00521         "movq %%mm3, 24(%1)                     \n\t"
00522         "add $16, %0                            \n\t"
00523         "add $32, %1                            \n\t"
00524         "add $16, %2                            \n\t"
00525         "cmp %3, %0                             \n\t"
00526             " jb 1b                             \n\t"
00527         : "+r" (block), "+r" (sum), "+r" (offset)
00528         : "r"(block+64)
00529     );
00530 }
00531 
/**
 * SSE2 DCT-domain denoiser, operating in place on a 64-coefficient block.
 *
 * Same operation as the MMX variant, 16 coefficients per iteration:
 *   - abs(c) is added to the 32-bit accumulator s->dct_error_sum[intra][i];
 *   - c is replaced by sign(c) * max(abs(c) - s->dct_offset[intra][i], 0)
 *     (unsigned saturating subtraction).
 * s->dct_count[intra] is incremented once per call.
 *
 * All loads and stores use movdqa (or memory operands of SSE2 arithmetic
 * instructions), so block, the error-sum array and the offset array must be
 * 16-byte aligned.
 *
 * The asm statement carries a "memory" clobber because it reads and writes
 * block[], sum[] and offset[] through pointer registers, which the compiler
 * cannot see from the operand list alone. This matches the other asm blocks
 * in this file.
 *
 * @param s     codec context; reads mb_intra, updates dct_count[] and
 *              dct_error_sum[], reads dct_offset[]
 * @param block 64 coefficients, modified in place
 */
00532 static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00533     const int intra= s->mb_intra;
00534     int *sum= s->dct_error_sum[intra];
00535     uint16_t *offset= s->dct_offset[intra];
00536 
00537     s->dct_count[intra]++;
00538 
      /* Operands: %0 = block (advances 32 bytes/iteration), %1 = sum (64 bytes),
       * %2 = offset (32 bytes), %3 = end pointer block+64. */
00539     __asm__ volatile(
00540         "pxor %%xmm7, %%xmm7                    \n\t" // xmm7 = 0, used for 16->32 bit widening
00541         "1:                                     \n\t"
00542         "pxor %%xmm0, %%xmm0                    \n\t"
00543         "pxor %%xmm1, %%xmm1                    \n\t"
00544         "movdqa (%0), %%xmm2                    \n\t" // 16 coefficients per iteration
00545         "movdqa 16(%0), %%xmm3                  \n\t"
00546         "pcmpgtw %%xmm2, %%xmm0                 \n\t" // sign mask: block[i] < 0 ? -1 : 0
00547         "pcmpgtw %%xmm3, %%xmm1                 \n\t"
00548         "pxor %%xmm0, %%xmm2                    \n\t"
00549         "pxor %%xmm1, %%xmm3                    \n\t"
00550         "psubw %%xmm0, %%xmm2                   \n\t" // abs(block[i])
00551         "psubw %%xmm1, %%xmm3                   \n\t"
00552         "movdqa %%xmm2, %%xmm4                  \n\t" // keep abs() for the error sum
00553         "movdqa %%xmm3, %%xmm5                  \n\t"
00554         "psubusw (%2), %%xmm2                   \n\t" // max(abs - offset, 0)
00555         "psubusw 16(%2), %%xmm3                 \n\t"
00556         "pxor %%xmm0, %%xmm2                    \n\t" // restore the sign
00557         "pxor %%xmm1, %%xmm3                    \n\t"
00558         "psubw %%xmm0, %%xmm2                   \n\t"
00559         "psubw %%xmm1, %%xmm3                   \n\t"
00560         "movdqa %%xmm2, (%0)                    \n\t"
00561         "movdqa %%xmm3, 16(%0)                  \n\t"
00562         "movdqa %%xmm4, %%xmm6                  \n\t"
00563         "movdqa %%xmm5, %%xmm0                  \n\t" // xmm0 is free again: the sign mask is no longer needed
00564         "punpcklwd %%xmm7, %%xmm4               \n\t" // zero-extend abs() to 32 bits
00565         "punpckhwd %%xmm7, %%xmm6               \n\t"
00566         "punpcklwd %%xmm7, %%xmm5               \n\t"
00567         "punpckhwd %%xmm7, %%xmm0               \n\t"
00568         "paddd (%1), %%xmm4                     \n\t" // sum[i] += abs(block[i])
00569         "paddd 16(%1), %%xmm6                   \n\t"
00570         "paddd 32(%1), %%xmm5                   \n\t"
00571         "paddd 48(%1), %%xmm0                   \n\t"
00572         "movdqa %%xmm4, (%1)                    \n\t"
00573         "movdqa %%xmm6, 16(%1)                  \n\t"
00574         "movdqa %%xmm5, 32(%1)                  \n\t"
00575         "movdqa %%xmm0, 48(%1)                  \n\t"
00576         "add $32, %0                            \n\t"
00577         "add $64, %1                            \n\t"
00578         "add $32, %2                            \n\t"
00579         "cmp %3, %0                             \n\t"
00580             " jb 1b                             \n\t" // loop until block reaches block+64
00581         : "+r" (block), "+r" (sum), "+r" (offset)
00582         : "r"(block+64)
        : "memory" // block[] and sum[] are rewritten through the pointer operands
00583     );
00584 }
00585 
/*
 * Instantiate mpegvideo_mmx_template.c once per instruction-set level.
 * Before each inclusion the HAVE_MMX2 / HAVE_SSE2 / HAVE_SSSE3 macros are
 * forced to 0 or 1 and RENAME()/RENAMEl() are redefined to append a
 * per-level suffix. The template itself is not visible here;
 * MPV_common_init_mmx() refers to dct_quantize_MMX, dct_quantize_MMX2,
 * dct_quantize_SSE2 and dct_quantize_SSSE3, which presumably come from
 * these inclusions -- TODO confirm.
 */

/* Remember whether the build configuration enabled SSSE3, then disable it
 * for the first three instantiations. */
00586 #if HAVE_SSSE3
00587 #define HAVE_SSSE3_BAK
00588 #endif
00589 #undef HAVE_SSSE3
00590 #define HAVE_SSSE3 0
00591 
/* Instantiation 1: plain MMX (MMX2, SSE2 and SSSE3 all off). */
00592 #undef HAVE_SSE2
00593 #undef HAVE_MMX2
00594 #define HAVE_SSE2 0
00595 #define HAVE_MMX2 0
00596 #define RENAME(a) a ## _MMX
00597 #define RENAMEl(a) a ## _mmx
00598 #include "mpegvideo_mmx_template.c"
00599 
/* Instantiation 2: MMX2 on, SSE2 and SSSE3 still off. */
00600 #undef HAVE_MMX2
00601 #define HAVE_MMX2 1
00602 #undef RENAME
00603 #undef RENAMEl
00604 #define RENAME(a) a ## _MMX2
00605 #define RENAMEl(a) a ## _mmx2
00606 #include "mpegvideo_mmx_template.c"
00607 
/* Instantiation 3: MMX2 and SSE2 on, SSSE3 still off. */
00608 #undef HAVE_SSE2
00609 #define HAVE_SSE2 1
00610 #undef RENAME
00611 #undef RENAMEl
00612 #define RENAME(a) a ## _SSE2
00613 #define RENAMEl(a) a ## _sse2
00614 #include "mpegvideo_mmx_template.c"
00615 
/* Instantiation 4: only when SSSE3 was enabled originally. HAVE_SSSE3 is
 * restored to 1 here and stays 1 for the rest of the file; otherwise it stays
 * 0. RENAMEl keeps the _sse2 suffix, so lower-case names resolve to the same
 * identifiers as in the SSE2 instantiation. */
00616 #ifdef HAVE_SSSE3_BAK
00617 #undef HAVE_SSSE3
00618 #define HAVE_SSSE3 1
00619 #undef RENAME
00620 #undef RENAMEl
00621 #define RENAME(a) a ## _SSSE3
00622 #define RENAMEl(a) a ## _sse2
00623 #include "mpegvideo_mmx_template.c"
00624 #endif
00625 
/**
 * Install the x86 SIMD routines of this file into the context's function
 * pointers, based on the CPU feature bits in mm_flags.
 *
 * Nothing is changed unless FF_MM_MMX is set.
 *
 * @param s codec context whose dct_unquantize_*, denoise_dct and
 *          dct_quantize pointers are set; reads s->avctx->dct_algo and
 *          s->flags
 */
00626 void MPV_common_init_mmx(MpegEncContext *s)
00627 {
00628     if (mm_flags & FF_MM_MMX) {
00629         const int dct_algo = s->avctx->dct_algo;
00630 
00631         s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00632         s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00633         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00634         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
          /* The MMX MPEG-2 intra dequantizer is skipped when bit-exact
           * output is requested; the existing pointer is left untouched. */
00635         if(!(s->flags & CODEC_FLAG_BITEXACT))
00636             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00637         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00638 
00639         if (mm_flags & FF_MM_SSE2) {
00640             s->denoise_dct= denoise_dct_sse2;
00641         } else {
00642                 s->denoise_dct= denoise_dct_mmx;
00643         }
00644 
          /* Quantizer selection, only for the automatic or explicit MMX DCT
           * choice. Priority: SSSE3 (when compiled in), SSE2, MMX2, MMX. */
00645         if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00646 #if HAVE_SSSE3
00647             if(mm_flags & FF_MM_SSSE3){
00648                 s->dct_quantize= dct_quantize_SSSE3;
00649             } else
              /* the dangling "else" above chains onto the SSE2 test below */
00650 #endif
00651             if(mm_flags & FF_MM_SSE2){
00652                 s->dct_quantize= dct_quantize_SSE2;
00653             } else if(mm_flags & FF_MM_MMX2){
00654                 s->dct_quantize= dct_quantize_MMX2;
00655             } else {
00656                 s->dct_quantize= dct_quantize_MMX;
00657             }
00658         }
00659     }
00660 }

Generated on Fri Sep 16 2011 17:17:46 for FFmpeg by  doxygen 1.7.1