libavcodec/ppc/gmc_altivec.c

/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"

/*
 * AltiVec-enhanced gmc1. ATM this code assumes that stride is a multiple
 * of 8, to preserve proper dst alignment.
 */
#define GMC1_PERF_COND (h==8)
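
/*
 * For orientation: gmc1 performs 1/16-pel bilinear interpolation over an
 * 8-pixel-wide block of h rows, with four weights derived from (x16, y16).
 * The scalar sketch below is illustrative only (the authoritative C
 * fallback is gmc1_c in dsputil.c) and is kept out of the build:
 */
#if 0
static void gmc1_scalar_sketch(uint8_t *dst, uint8_t *src, int stride,
                               int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
#endif
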
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, int stride, int h, int x16, int y16, int rounder)
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
    const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;


POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    tempA = vec_ld(0, (unsigned short*)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);

    // We will be able to pick up our 9 char elements at src from those
    // 32 bytes. We load the first batch here, as inside the loop we can
    // reuse 'src + stride' from one iteration as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

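    /*
     * Note on the load idiom above (illustrative sketch, not an extra
     * code path): vec_ld() can only load from 16-byte-aligned addresses,
     * so an unaligned 16-byte window is assembled from the two aligned
     * vectors that straddle it, using the permute mask from vec_lvsl():
     *
     *     lo = vec_ld(offset,      ptr);
     *     hi = vec_ld(offset + 16, ptr);
     *     v  = vec_perm(lo, hi, vec_lvsl(offset, ptr));
     */
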
    for(i=0; i<h; i++) {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // We will be able to pick up our 9 char elements at src + stride
        // from those 32 bytes, then reuse the resulting two vectors srcvC
        // and srcvD as the next srcvA and srcvB.
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            // if src & 0xF == 0xF, then (src+1) is properly aligned
            // on the second vector.
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);


        // OK, now we (finally) do the math :-)
        // These four instructions replace 32 integer muls & 32 integer adds.
        // Isn't AltiVec nice?
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

        // dst is only 8-byte aligned, so pick which half of the 16-byte
        // store receives the 8 new pixels and keep the bytes already
        // present in dstv for the other half.
        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}
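
/*
 * Usage note (illustrative, not part of this translation unit's code):
 * the PowerPC DSP initialization is expected to install this routine in
 * the DSPContext function-pointer table, roughly as
 *
 *     c->gmc1 = gmc1_altivec;
 *
 * so that decoders dispatch to it transparently instead of the scalar
 * gmc1_c fallback.
 */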
