libavcodec/ppc/h264_template_altivec.c

/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

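/* One row of the 8-wide bilinear chroma interpolation:
 * psum = vA*src0 + vB*src1 + vC*src2 + vD*src3 (+ bias), shifted right by 6,
 * packed to bytes and merged with the untouched half of the 16-byte
 * destination vector through fperm. The bottom row of this step becomes the
 * top row of the next one (vsrc2/3 -> vsrc0/1). */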
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

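/* Degenerate 1-D variant used when x == 0 or y == 0: only two taps are
 * needed, vA and the combined weight vE = vB + vC (one of vB, vC is zero
 * in these cases); the rounding constant 32 (v32ss) is folded in. */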
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

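/* Bias helpers for CHROMA_MC8_ALTIVEC_CORE: the H.264 version folds the
 * rounding constant 32 into the first multiply-add (BIAS1 = v32ss,
 * BIAS2 = noop), while the "no rounding" VC-1 version starts from 0 and
 * adds 28 afterwards (BIAS1 = 0, BIAS2 = add28). */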
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

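/* 8xH H.264 chroma motion compensation with eighth-pel offsets x, y:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * where A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy. OP_U8_ALTIVEC either
 * stores the result or averages it with dst, depending on how the template
 * is instantiated (put/avg). */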
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
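    /* Each row needs the 9 bytes src[0..8]. If src % 16 > 7 they spill into
     * the next 16-byte block, so a second aligned load is required. When
     * src % 16 == 15, vec_lvsl(1, src) wraps around to zero, so the row
     * shifted by one byte is simply the second aligned block (vsrcBuc). */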
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

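    /* Only 8 new pixels are produced per row; fperm merges them with the 8
     * untouched bytes of the aligned 16-byte destination vector, selecting
     * the low or high half depending on the alignment of dst. */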
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

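    /* D = x*y is nonzero only when both x and y are nonzero: use the full
     * 4-tap core. Otherwise the interpolation is 1-D (vertical when x == 0,
     * horizontal when y == 0) and the two remaining weights collapse into
     * vE = vB + vC, handled by the SIMPLE core. */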
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

/* this code assumes that stride % 16 == 0 */
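/* VC-1 "no rounding" variant of the 8xH chroma MC above: the same bilinear
 * interpolation, but the rounding constant is 28 instead of 32 and is added
 * after the multiply-adds (see add28). Unlike the H.264 version, the 1-D
 * special cases are not split out. */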
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
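/* Horizontal 16-wide luma half-pel filter. For each of the 16 rows:
 *   dst[i] = clip((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                  - 5*src[i+2] + src[i+3] + 16) >> 5)
 * computed on two vec_s16 halves (A = pixels 0-7, B = pixels 8-15). */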
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

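        /* Each row needs the 21 bytes src[-2..18]. The two aligned loads
         * srcR1/srcR2 cover them only when align <= 11; for align 12-15 a
         * third load at offset 30 is needed. In each special case below one
         * of the vec_lvsl permutes wraps to zero, so that shifted row is an
         * aligned block used as-is. */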
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
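/* Vertical 16-wide luma half-pel filter: the same [1 -5 20 20 -5 1] 6-tap
 * kernel as the horizontal version, applied down the columns. Six source
 * rows are kept in registers and rotated each iteration, so only one new
 * row is loaded per output row. */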
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
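/* Combined horizontal + vertical half-pel filter. The first loop runs the
 * horizontal 6-tap kernel over 16+5 = 21 rows and stores the unrounded,
 * unshifted 16-bit intermediates in tmp. The second loop applies the same
 * kernel vertically to those intermediates in 32-bit arithmetic, adds the
 * rounding constant 512 and shifts right by 10 before packing/clipping. */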
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

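    /* Vertical pass over the 16-bit intermediates. They can overflow 16 bits
     * once multiplied by 20, so even and odd elements are widened to 32 bits
     * with vec_mule/vec_mulo; pp3Ae/pp3Be use an arithmetic shift right by 16
     * on the vector reinterpreted as vec_s32 to extract the sign-extended
     * even halfwords of sum3 (a multiply by 1). mperm re-interleaves the
     * even/odd results into pixel order before the store. */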
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
