• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/vp3dsp_mmx.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2004 the ffmpeg project
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "dsputil_mmx.h"
00029 #include "vp3dsp_mmx.h"
00030 
00031 extern const uint16_t ff_vp3_idct_data[];
00032 
00033 // this is off by one or two for some cases when filter_limit is greater than 63
00034 // in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
00035 // out: p1 in mm4, p2 in mm3
      // Core of the VP3 loop filter, working on 8 unsigned bytes per register.
      // flim: asm operand holding the per-byte limit; it is loaded into mm5.
      // Every one of mm0-mm7 is overwritten.
      // ff_pb_1, ff_pb_3, ff_pb_7, ff_pb_1F and ff_pb_81 are defined elsewhere;
      // presumably each holds the named hex value replicated in all 8 bytes.
00036 #define VP3_LOOP_FILTER(flim) \
00037     "movq       %%mm6, %%mm7 \n\t" \
00038     "pand    "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
00039     "psrlw         $3, %%mm7 \n\t" \
00040     "pand    "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
00041     "movq       %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
00042     "pxor       %%mm4, %%mm2 \n\t" \
00043     "pand    "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
00044     "movq       %%mm2, %%mm5 \n\t" \
00045     "paddb      %%mm2, %%mm2 \n\t" \
00046     "paddb      %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
00047     "paddb      %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
00048     "pcmpeqb    %%mm0, %%mm0 \n\t" \
00049     "pxor       %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
00050     "pavgb      %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
00051     "pxor       %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
00052     "pavgb      %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
00053     "paddb   "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
00054     "pavgb      %%mm0, %%mm1 \n\t" /* 128+2+(   p2-p1  - p3) >> 2 */ \
00055     "pavgb      %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
00056     "paddusb    %%mm1, %%mm7 \n\t" /* d+128+1 */ \
00057     "movq    "MANGLE(ff_pb_81)", %%mm6 \n\t" \
00058     "psubusb    %%mm7, %%mm6 \n\t" /* mm6 = max(-d, 0), assuming ff_pb_81 is 128+1 per byte */ \
00059     "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" /* mm7 = max(d, 0), same assumption */ \
00060 \
00061     "movq     "#flim", %%mm5 \n\t" /* mm5 = limit */ \
00062     "pminub     %%mm5, %%mm6 \n\t" /* x = min(x, limit) for both halves */ \
00063     "pminub     %%mm5, %%mm7 \n\t" \
00064     "movq       %%mm6, %%mm0 \n\t" /* keep min(x, limit) */ \
00065     "movq       %%mm7, %%mm1 \n\t" \
00066     "paddb      %%mm6, %%mm6 \n\t" /* 2*min(x, limit), wrapping add */ \
00067     "paddb      %%mm7, %%mm7 \n\t" \
00068     "pminub     %%mm5, %%mm6 \n\t" /* min(2*min(x, limit), limit) */ \
00069     "pminub     %%mm5, %%mm7 \n\t" \
00070     "psubb      %%mm0, %%mm6 \n\t" /* ... minus min(x, limit): the bounded magnitude */ \
00071     "psubb      %%mm1, %%mm7 \n\t" \
00072     "paddusb    %%mm7, %%mm4 \n\t" /* p1 += positive part (saturating) */ \
00073     "psubusb    %%mm6, %%mm4 \n\t" /* p1 -= negative part (saturating) */ \
00074     "psubusb    %%mm7, %%mm3 \n\t" /* p2 -= positive part (saturating) */ \
00075     "paddusb    %%mm6, %%mm3 \n\t" /* p2 += negative part (saturating) */
00076 
      /* Store the four 16-bit words of the MMX register mm to four addresses.
       * dst0..dst3 are memory operand expressions such as (%1,%3); the literal
       * "-1" pasted in front of each one becomes its displacement, so every
       * store lands one byte before the given address.
       * Asm operand %0 is used as a scratch general-purpose register.
       * mm is destroyed: it is shifted right by 32 bits half-way through.
       * Word order: dst0 = bits 0-15, dst1 = bits 16-31,
       *             dst2 = bits 32-47, dst3 = bits 48-63 of mm. */
00077 #define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
00078     "movd "#mm", %0        \n\t" /* %0 = low 32 bits of mm */ \
00079     "movw   %w0, -1"#dst0" \n\t" \
00080     "psrlq  $32, "#mm"     \n\t" /* bring the high 32 bits down */ \
00081     "shr    $16, %0        \n\t" \
00082     "movw   %w0, -1"#dst1" \n\t" \
00083     "movd "#mm", %0        \n\t" /* %0 = former high 32 bits of mm */ \
00084     "movw   %w0, -1"#dst2" \n\t" \
00085     "shr    $16, %0        \n\t" \
00086     "movw   %w0, -1"#dst3" \n\t"
00087 
      /*
       * Run VP3_LOOP_FILTER vertically on 8 adjacent byte columns.
       *
       * Eight bytes are loaded from each of the rows src-2*stride, src-stride,
       * src and src+stride (as p0..p3); the filtered p1 and p2 are written back
       * to the rows src-stride and src. The outer two rows are only read.
       *
       * src:             address of the first byte of the row called p2 above
       * stride:          distance in bytes between consecutive rows
       * bounding_values: the 8 bytes starting at bounding_values+129 are passed
       *                  to the filter as its limit operand (presumably prepared
       *                  by the caller as a packed byte limit - confirm)
       */
00088 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
00089 {
00090     __asm__ volatile(
00091         "movq          %0, %%mm6 \n\t"
00092         "movq          %1, %%mm4 \n\t"
00093         "movq          %2, %%mm2 \n\t"
00094         "movq          %3, %%mm1 \n\t"
00095 
00096         VP3_LOOP_FILTER(%4)
00097 
00098         "movq       %%mm4, %1    \n\t"
00099         "movq       %%mm3, %2    \n\t"
00100 
00101         : "+m" (*(uint64_t*)(src - 2*stride)),
00102           "+m" (*(uint64_t*)(src - 1*stride)),
00103           "+m" (*(uint64_t*)(src + 0*stride)),
00104           "+m" (*(uint64_t*)(src + 1*stride))
00105         : "m"(*(uint64_t*)(bounding_values+129))
00106     );
00107 }
00108 
      /*
       * Run VP3_LOOP_FILTER horizontally on 8 consecutive rows.
       *
       * For each row the bytes at src-2 .. src+1 are the filter inputs; after
       * filtering, two bytes are written back at offset -1 of each row (see
       * STORE_4_WORDS). Rows 0-3 are addressed from src, rows 4-7 from
       * src+4*stride.
       *
       * Asm operands: %0 = tmp (scratch for STORE_4_WORDS), %1 = src,
       * %2 = src+4*stride, %3 = stride, %4 = 3*stride,
       * %5 = 8 bytes at bounding_values+129 (the filter limit operand).
       *
       * TRANSPOSE8x4 and SBUTTERFLY are defined elsewhere; presumably they turn
       * the eight 4-byte rows into the p0..p3 register layout and interleave
       * the two result registers back into per-row byte pairs - confirm against
       * their definitions.
       */
00109 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
00110 {
00111     x86_reg tmp;
00112 
00113     __asm__ volatile(
00114         "movd -2(%1),      %%mm6 \n\t"
00115         "movd -2(%1,%3),   %%mm0 \n\t"
00116         "movd -2(%1,%3,2), %%mm1 \n\t"
00117         "movd -2(%1,%4),   %%mm4 \n\t"
00118 
00119         TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
00120         VP3_LOOP_FILTER(%5)
00121         SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)
00122 
00123         STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
00124         STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
00125 
00126         : "=&r"(tmp)
00127         : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
00128           "m"(*(uint64_t*)(bounding_values+129))
00129         : "memory"
00130     );
00131 }
00132 
00133 /* from original comments: The Macro does IDct on 4 1-D Dcts */
      /* Shared first stage of RowIDCT() and ColumnIDCT(): one 1-D IDCT carried
       * out on four 16-bit lanes at once. Inputs are read through I(0..3) and
       * J(4..7), cosine constants through C(1..7); those macros must be defined
       * where this one is expanded. I(1) and I(2) are overwritten with the
       * intermediates C. and D.
       * Register contents on exit:
       *   r0 = C.   r1 = H.   r2 = R2   r4 = E
       *   r5 = B..  r6 = F.   r7 = G    (r3 holds a temporary) */
00134 #define BeginIDCT() \
00135     "movq   "I(3)", %%mm2 \n\t" \
00136     "movq   "C(3)", %%mm6 \n\t" \
00137     "movq    %%mm2, %%mm4 \n\t" \
00138     "movq   "J(5)", %%mm7 \n\t" \
00139     "pmulhw  %%mm6, %%mm4 \n\t"    /* r4 = c3*i3 - i3 */ \
00140     "movq   "C(5)", %%mm1 \n\t" \
00141     "pmulhw  %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 - i5 */ \
00142     "movq    %%mm1, %%mm5 \n\t" \
00143     "pmulhw  %%mm2, %%mm1 \n\t"    /* r1 = c5*i3 - i3 */ \
00144     "movq   "I(1)", %%mm3 \n\t" \
00145     "pmulhw  %%mm7, %%mm5 \n\t"    /* r5 = c5*i5 - i5 */ \
00146     "movq   "C(1)", %%mm0 \n\t" \
00147     "paddw   %%mm2, %%mm4 \n\t"    /* r4 = c3*i3 */ \
00148     "paddw   %%mm7, %%mm6 \n\t"    /* r6 = c3*i5 */ \
00149     "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c5*i3 */ \
00150     "movq   "J(7)", %%mm1 \n\t" \
00151     "paddw   %%mm5, %%mm7 \n\t"    /* r7 = c5*i5 */ \
00152     "movq    %%mm0, %%mm5 \n\t"    /* r5 = c1 */ \
00153     "pmulhw  %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 - i1 */ \
00154     "paddsw  %%mm7, %%mm4 \n\t"    /* r4 = C = c3*i3 + c5*i5 */ \
00155     "pmulhw  %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 - i7 */ \
00156     "movq   "C(7)", %%mm7 \n\t" \
00157     "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = D = c3*i5 - c5*i3 */ \
00158     "paddw   %%mm3, %%mm0 \n\t"    /* r0 = c1*i1 */ \
00159     "pmulhw  %%mm7, %%mm3 \n\t"    /* r3 = c7*i1 */ \
00160     "movq   "I(2)", %%mm2 \n\t" \
00161     "pmulhw  %%mm1, %%mm7 \n\t"    /* r7 = c7*i7 */ \
00162     "paddw   %%mm1, %%mm5 \n\t"    /* r5 = c1*i7 */ \
00163     "movq    %%mm2, %%mm1 \n\t"    /* r1 = i2 */ \
00164     "pmulhw "C(2)", %%mm2 \n\t"    /* r2 = c2*i2 - i2 */ \
00165     "psubsw  %%mm5, %%mm3 \n\t"    /* r3 = B = c7*i1 - c1*i7 */ \
00166     "movq   "J(6)", %%mm5 \n\t" \
00167     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = A = c1*i1 + c7*i7 */ \
00168     "movq    %%mm5, %%mm7 \n\t"    /* r7 = i6 */ \
00169     "psubsw  %%mm4, %%mm0 \n\t"    /* r0 = A - C */ \
00170     "pmulhw "C(2)", %%mm5 \n\t"    /* r5 = c2*i6 - i6 */ \
00171     "paddw   %%mm1, %%mm2 \n\t"    /* r2 = c2*i2 */ \
00172     "pmulhw "C(6)", %%mm1 \n\t"    /* r1 = c6*i2 */ \
00173     "paddsw  %%mm4, %%mm4 \n\t"    /* r4 = C + C */ \
00174     "paddsw  %%mm0, %%mm4 \n\t"    /* r4 = C. = A + C */ \
00175     "psubsw  %%mm6, %%mm3 \n\t"    /* r3 = B - D */ \
00176     "paddw   %%mm7, %%mm5 \n\t"    /* r5 = c2*i6 */ \
00177     "paddsw  %%mm6, %%mm6 \n\t"    /* r6 = D + D */ \
00178     "pmulhw "C(6)", %%mm7 \n\t"    /* r7 = c6*i6 */ \
00179     "paddsw  %%mm3, %%mm6 \n\t"    /* r6 = D. = B + D */ \
00180     "movq    %%mm4, "I(1)"\n\t"    /* save C. at I(1) */ \
00181     "psubsw  %%mm5, %%mm1 \n\t"    /* r1 = H = c6*i2 - c2*i6 */ \
00182     "movq   "C(4)", %%mm4 \n\t" \
00183     "movq    %%mm3, %%mm5 \n\t"    /* r5 = B - D */ \
00184     "pmulhw  %%mm4, %%mm3 \n\t"    /* r3 = (c4 - 1) * (B - D) */ \
00185     "paddsw  %%mm2, %%mm7 \n\t"    /* r7 = G = c6*i6 + c2*i2 */ \
00186     "movq    %%mm6, "I(2)"\n\t"    /* save D. at I(2) */ \
00187     "movq    %%mm0, %%mm2 \n\t"    /* r2 = A - C */ \
00188     "movq   "I(0)", %%mm6 \n\t" \
00189     "pmulhw  %%mm4, %%mm0 \n\t"    /* r0 = (c4 - 1) * (A - C) */ \
00190     "paddw   %%mm3, %%mm5 \n\t"    /* r5 = B. = c4 * (B - D) */ \
00191     "movq   "J(4)", %%mm3 \n\t" \
00192     "psubsw  %%mm1, %%mm5 \n\t"    /* r5 = B.. = B. - H */ \
00193     "paddw   %%mm0, %%mm2 \n\t"    /* r2 = A. = c4 * (A - C) */ \
00194     "psubsw  %%mm3, %%mm6 \n\t"    /* r6 = i0 - i4 */ \
00195     "movq    %%mm6, %%mm0 \n\t" \
00196     "pmulhw  %%mm4, %%mm6 \n\t"    /* r6 = (c4 - 1) * (i0 - i4) */ \
00197     "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = i4 + i4 */ \
00198     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H + H */ \
00199     "paddsw  %%mm0, %%mm3 \n\t"    /* r3 = i0 + i4 */ \
00200     "paddsw  %%mm5, %%mm1 \n\t"    /* r1 = H. = B. + H */ \
00201     "pmulhw  %%mm3, %%mm4 \n\t"    /* r4 = (c4 - 1) * (i0 + i4) */ \
00202     "paddsw  %%mm0, %%mm6 \n\t"    /* r6 = F = c4 * (i0 - i4) */ \
00203     "psubsw  %%mm2, %%mm6 \n\t"    /* r6 = F. = F - A. */ \
00204     "paddsw  %%mm2, %%mm2 \n\t"    /* r2 = A. + A. */ \
00205     "movq   "I(1)", %%mm0 \n\t"    /* r0 = C. */ \
00206     "paddsw  %%mm6, %%mm2 \n\t"    /* r2 = A.. = F + A. */ \
00207     "paddw   %%mm3, %%mm4 \n\t"    /* r4 = E = c4 * (i0 + i4) */ \
00208     "psubsw  %%mm1, %%mm2 \n\t"    /* r2 = R2 = A.. - H. */
00209 
00210 /* RowIDCT gets ready to transpose */
      /* Finishes the 1-D IDCT started by BeginIDCT() without any rounding or
       * shifting. On exit r0 = R0, r2..r7 = R2..R7 and R1 is stored at I(1),
       * which is the layout Transpose() expects on entry. */
00211 #define RowIDCT() \
00212     BeginIDCT() \
00213     "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
00214     "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
00215     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
00216     "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
00217     "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
00218     "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
00219     "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
00220     "paddsw  %%mm3, %%mm3 \n\t" \
00221     "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
00222     "paddsw  %%mm5, %%mm5 \n\t" \
00223     "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
00224     "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
00225     "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
00226     "paddsw  %%mm0, %%mm0 \n\t" \
00227     "movq    %%mm1, "I(1)"\n\t"    /* save R1 */ \
00228     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */
00229 
00230 /* Column IDCT normalizes and stores final results */
      /* Finishes the 1-D IDCT started by BeginIDCT(). Each result pair has OC_8
       * added before an arithmetic shift right by 4, and all eight results are
       * stored: NR0..NR3 at I(0)..I(3), NR4..NR7 at J(4)..J(7).
       * OC_8 must be a memory operand; ff_vp3_idct_mmx binds it to ff_pw_8,
       * presumably the value 8 in every 16-bit word (i.e. round-to-nearest). */
00231 #define ColumnIDCT() \
00232     BeginIDCT() \
00233     "paddsw "OC_8", %%mm2 \n\t"    /* adjust R2 (and R1) for shift */ \
00234     "paddsw  %%mm1, %%mm1 \n\t"    /* r1 = H. + H. */ \
00235     "paddsw  %%mm2, %%mm1 \n\t"    /* r1 = R1 = A.. + H. */ \
00236     "psraw      $4, %%mm2 \n\t"    /* r2 = NR2 */ \
00237     "psubsw  %%mm7, %%mm4 \n\t"    /* r4 = E. = E - G */ \
00238     "psraw      $4, %%mm1 \n\t"    /* r1 = NR1 */ \
00239     "movq   "I(2)", %%mm3 \n\t"    /* r3 = D. */ \
00240     "paddsw  %%mm7, %%mm7 \n\t"    /* r7 = G + G */ \
00241     "movq    %%mm2, "I(2)"\n\t"    /* store NR2 at I2 */ \
00242     "paddsw  %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
00243     "movq    %%mm1, "I(1)"\n\t"    /* store NR1 at I1 */ \
00244     "psubsw  %%mm3, %%mm4 \n\t"    /* r4 = R4 = E. - D. */ \
00245     "paddsw "OC_8", %%mm4 \n\t"    /* adjust R4 (and R3) for shift */ \
00246     "paddsw  %%mm3, %%mm3 \n\t"    /* r3 = D. + D. */ \
00247     "paddsw  %%mm4, %%mm3 \n\t"    /* r3 = R3 = E. + D. */ \
00248     "psraw      $4, %%mm4 \n\t"    /* r4 = NR4 */ \
00249     "psubsw  %%mm5, %%mm6 \n\t"    /* r6 = R6 = F. - B.. */ \
00250     "psraw      $4, %%mm3 \n\t"    /* r3 = NR3 */ \
00251     "paddsw "OC_8", %%mm6 \n\t"    /* adjust R6 (and R5) for shift */ \
00252     "paddsw  %%mm5, %%mm5 \n\t"    /* r5 = B.. + B.. */ \
00253     "paddsw  %%mm6, %%mm5 \n\t"    /* r5 = R5 = F. + B.. */ \
00254     "psraw      $4, %%mm6 \n\t"    /* r6 = NR6 */ \
00255     "movq    %%mm4, "J(4)"\n\t"    /* store NR4 at J4 */ \
00256     "psraw      $4, %%mm5 \n\t"    /* r5 = NR5 */ \
00257     "movq    %%mm3, "I(3)"\n\t"    /* store NR3 at I3 */ \
00258     "psubsw  %%mm0, %%mm7 \n\t"    /* r7 = R7 = G. - C. */ \
00259     "paddsw "OC_8", %%mm7 \n\t"    /* adjust R7 (and R0) for shift */ \
00260     "paddsw  %%mm0, %%mm0 \n\t"    /* r0 = C. + C. */ \
00261     "paddsw  %%mm7, %%mm0 \n\t"    /* r0 = R0 = G. + C. */ \
00262     "psraw      $4, %%mm7 \n\t"    /* r7 = NR7 */ \
00263     "movq    %%mm6, "J(6)"\n\t"    /* store NR6 at J6 */ \
00264     "psraw      $4, %%mm0 \n\t"    /* r0 = NR0 */ \
00265     "movq    %%mm5, "J(5)"\n\t"    /* store NR5 at J5 */ \
00266     "movq    %%mm7, "J(7)"\n\t"    /* store NR7 at J7 */ \
00267     "movq    %%mm0, "I(0)"\n\t"    /* store NR0 at I0 */
00268 
00269 /* Following macro does two 4x4 transposes in place.
00270 
00271   At entry (we assume):
00272 
00273     r0 = a3 a2 a1 a0
00274     I(1) = b3 b2 b1 b0
00275     r2 = c3 c2 c1 c0
00276     r3 = d3 d2 d1 d0
00277 
00278     r4 = e3 e2 e1 e0
00279     r5 = f3 f2 f1 f0
00280     r6 = g3 g2 g1 g0
00281     r7 = h3 h2 h1 h0
00282 
00283   At exit, we have:
00284 
00285     I(0) = d0 c0 b0 a0
00286     I(1) = d1 c1 b1 a1
00287     I(2) = d2 c2 b2 a2
00288     I(3) = d3 c3 b3 a3
00289 
00290     J(4) = h0 g0 f0 e0
00291     J(5) = h1 g1 f1 e1
00292     J(6) = h2 g2 f2 e2
00293     J(7) = h3 g3 f3 e3
00294 
00295    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
00296    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
00297 
00298    Since r1 is free at entry, we calculate the Js first.
         r0 is parked in I(0) meanwhile, because r0 is needed as a second
         temporary; it is reloaded (into r4) before the Is are transposed.
         Each element is one 16-bit word (the macro uses punpck*wd / punpck*dq). */
00299 #define Transpose() \
00300     "movq       %%mm4, %%mm1 \n\t"    /* r1 = e3 e2 e1 e0 */ \
00301     "punpcklwd  %%mm5, %%mm4 \n\t"    /* r4 = f1 e1 f0 e0 */ \
00302     "movq       %%mm0, "I(0)"\n\t"    /* save a3 a2 a1 a0 */ \
00303     "punpckhwd  %%mm5, %%mm1 \n\t"    /* r1 = f3 e3 f2 e2 */ \
00304     "movq       %%mm6, %%mm0 \n\t"    /* r0 = g3 g2 g1 g0 */ \
00305     "punpcklwd  %%mm7, %%mm6 \n\t"    /* r6 = h1 g1 h0 g0 */ \
00306     "movq       %%mm4, %%mm5 \n\t"    /* r5 = f1 e1 f0 e0 */ \
00307     "punpckldq  %%mm6, %%mm4 \n\t"    /* r4 = h0 g0 f0 e0 = R4 */ \
00308     "punpckhdq  %%mm6, %%mm5 \n\t"    /* r5 = h1 g1 f1 e1 = R5 */ \
00309     "movq       %%mm1, %%mm6 \n\t"    /* r6 = f3 e3 f2 e2 */ \
00310     "movq       %%mm4, "J(4)"\n\t" \
00311     "punpckhwd  %%mm7, %%mm0 \n\t"    /* r0 = h3 g3 h2 g2 */ \
00312     "movq       %%mm5, "J(5)"\n\t" \
00313     "punpckhdq  %%mm0, %%mm6 \n\t"    /* r6 = h3 g3 f3 e3 = R7 */ \
00314     "movq      "I(0)", %%mm4 \n\t"    /* r4 = a3 a2 a1 a0 */ \
00315     "punpckldq  %%mm0, %%mm1 \n\t"    /* r1 = h2 g2 f2 e2 = R6 */ \
00316     "movq      "I(1)", %%mm5 \n\t"    /* r5 = b3 b2 b1 b0 */ \
00317     "movq       %%mm4, %%mm0 \n\t"    /* r0 = a3 a2 a1 a0 */ \
00318     "movq       %%mm6, "J(7)"\n\t" \
00319     "punpcklwd  %%mm5, %%mm0 \n\t"    /* r0 = b1 a1 b0 a0 */ \
00320     "movq       %%mm1, "J(6)"\n\t" \
00321     "punpckhwd  %%mm5, %%mm4 \n\t"    /* r4 = b3 a3 b2 a2 */ \
00322     "movq       %%mm2, %%mm5 \n\t"    /* r5 = c3 c2 c1 c0 */ \
00323     "punpcklwd  %%mm3, %%mm2 \n\t"    /* r2 = d1 c1 d0 c0 */ \
00324     "movq       %%mm0, %%mm1 \n\t"    /* r1 = b1 a1 b0 a0 */ \
00325     "punpckldq  %%mm2, %%mm0 \n\t"    /* r0 = d0 c0 b0 a0 = R0 */ \
00326     "punpckhdq  %%mm2, %%mm1 \n\t"    /* r1 = d1 c1 b1 a1 = R1 */ \
00327     "movq       %%mm4, %%mm2 \n\t"    /* r2 = b3 a3 b2 a2 */ \
00328     "movq       %%mm0, "I(0)"\n\t" \
00329     "punpckhwd  %%mm3, %%mm5 \n\t"    /* r5 = d3 c3 d2 c2 */ \
00330     "movq       %%mm1, "I(1)"\n\t" \
00331     "punpckhdq  %%mm5, %%mm4 \n\t"    /* r4 = d3 c3 b3 a3 = R3 */ \
00332     "punpckldq  %%mm5, %%mm2 \n\t"    /* r2 = d2 c2 b2 a2 = R2 */ \
00333     "movq       %%mm4, "I(3)"\n\t" \
00334     "movq       %%mm2, "I(2)"\n\t"
00335 
      /*
       * In-place 8x8 inverse DCT on the 64 16-bit coefficients at output_data.
       *
       * Four passes, selected by redefining the I()/J() address macros:
       *   1. RowIDCT + Transpose on byte offsets 0..63    (rows 0-3)
       *   2. RowIDCT + Transpose on byte offsets 64..127  (rows 4-7)
       *   3. ColumnIDCT on words 0-3 of every 16-byte row
       *   4. ColumnIDCT on words 4-7 of every 16-byte row
       *
       * The asm statement stores into the block through a plain register
       * operand, so it declares a "memory" clobber to tell the compiler that
       * *output_data changes (as ff_vp3_h_loop_filter_mmx2 already does).
       * MMX registers are used without being declared, and no emms is issued
       * in this function.
       */
00336 void ff_vp3_idct_mmx(int16_t *output_data)
00337 {
00338     /* Operand mapping used by the macros expanded below:
00339      *  %0 = output_data, the coefficient block, transformed in place
00340      *  %1 = ff_vp3_idct_data, table of IDCT constants
00341      *  %2 = ff_pw_8 (OC_8), added by ColumnIDCT before its final >> 4
00342      *  C(x) = 16*(x-1)(%1), i.e. constant x lives in the x-th 16-byte slot
00343      *  I(x)/J(x) = addresses relative to %0, redefined before each pass
00344      * r0..r7 = mm0..mm7
00345      */
00346 
00347 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
00348 #define OC_8 "%2"
00349 
00350     /* dequantization, dezigzag and partial transposition are assumed to be
00351      * done already (presumably by the caller); only the idct itself follows */
00352 #define I(x) AV_STRINGIFY(16* x       )"(%0)"
00353 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
00354 
00355     __asm__ volatile (
00356         RowIDCT()
00357         Transpose()
00358 
00359 #undef I
00360 #undef J
00361 #define I(x) AV_STRINGIFY(16* x    + 64)"(%0)"
00362 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
00363 
00364         RowIDCT()
00365         Transpose()
00366 
00367 #undef I
00368 #undef J
00369 #define I(x) AV_STRINGIFY(16*x)"(%0)"
00370 #define J(x) AV_STRINGIFY(16*x)"(%0)"
00371 
00372         ColumnIDCT()
00373 
00374 #undef I
00375 #undef J
00376 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
00377 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
00378 
00379         ColumnIDCT()
00380         :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
              : "memory" /* the block at %0 is rewritten in place */
00381     );
00382 #undef I
00383 #undef J
00384 
00385 }
00386 
      /*
       * Inverse-transform block in place with ff_vp3_idct_mmx(), then pass the
       * result to put_signed_pixels_clamped_mmx() together with dest and
       * line_size. That helper is defined elsewhere; judging by its name it
       * writes the clamped, signed residual as pixels - confirm there.
       * Note that block is modified.
       */
00387 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00388 {
00389     ff_vp3_idct_mmx(block);
00390     put_signed_pixels_clamped_mmx(block, dest, line_size);
00391 }
00392 
      /*
       * Inverse-transform block in place with ff_vp3_idct_mmx(), then pass the
       * result to add_pixels_clamped_mmx() together with dest and line_size.
       * That helper is defined elsewhere; judging by its name it adds the
       * residual to the pixels already at dest with clamping - confirm there.
       * Note that block is modified.
       */
00393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00394 {
00395     ff_vp3_idct_mmx(block);
00396     add_pixels_clamped_mmx(block, dest, line_size);
00397 }
00398 
      /*
       * DC-only shortcut: add one constant, derived from block[0], to an 8x8
       * area of bytes at dest, saturating each byte to 0..255.
       *
       * dest:     top-left byte of the 8x8 area; rows are linesize bytes apart
       * linesize: row distance in bytes
       * block:    only block[0] is read
       *
       * The value is scaled twice by 46341/65536 (46341 ~= 65536/sqrt(2)), with
       * rounding and a further >> 4 folded into the second step. Both steps
       * rely on >> of a negative int being an arithmetic shift.
       *
       * The asm statement writes to memory through dest, so it declares a
       * "memory" clobber (as ff_vp3_h_loop_filter_mmx2 already does).
       */
00399 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
00400 {
00401     int dc = block[0];
00402     dc = (46341*dc)>>16;
00403     dc = (46341*dc + (8<<16))>>20;
00404 
00405     __asm__ volatile(
00406         "movd          %3, %%mm0 \n\t"
00407         "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast dc to all four words */
00408         "pxor       %%mm1, %%mm1 \n\t"
00409         "psubw      %%mm0, %%mm1 \n\t" /* mm1 = -dc */
00410         "packuswb   %%mm0, %%mm0 \n\t" /* mm0 = max(dc, 0) in every byte */
00411         "packuswb   %%mm1, %%mm1 \n\t" /* mm1 = max(-dc, 0) in every byte */
00412 
          /* DC_ADD handles four rows: saturating add of mm0, then saturating
           * subtract of mm1; one of the two registers is always zero. */
00413 #define DC_ADD \
00414         "movq        (%0), %%mm2 \n\t" \
00415         "movq     (%0,%1), %%mm3 \n\t" \
00416         "paddusb    %%mm0, %%mm2 \n\t" \
00417         "movq   (%0,%1,2), %%mm4 \n\t" \
00418         "paddusb    %%mm0, %%mm3 \n\t" \
00419         "movq     (%0,%2), %%mm5 \n\t" \
00420         "paddusb    %%mm0, %%mm4 \n\t" \
00421         "paddusb    %%mm0, %%mm5 \n\t" \
00422         "psubusb    %%mm1, %%mm2 \n\t" \
00423         "psubusb    %%mm1, %%mm3 \n\t" \
00424         "movq       %%mm2, (%0)  \n\t" \
00425         "psubusb    %%mm1, %%mm4 \n\t" \
00426         "movq       %%mm3, (%0,%1) \n\t" \
00427         "psubusb    %%mm1, %%mm5 \n\t" \
00428         "movq       %%mm4, (%0,%1,2) \n\t" \
00429         "movq       %%mm5, (%0,%2) \n\t"
00430 
00431         DC_ADD
00432         "lea    (%0,%1,4), %0 \n\t" /* advance dest by four rows */
00433         DC_ADD
00434 
00435         : "+r"(dest)
00436         : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
              : "memory" /* the 8x8 area at dest is rewritten */
00437     );
00438 }

Generated on Fri Sep 16 2011 17:17:47 for FFmpeg by  doxygen 1.7.1