/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

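/* 8 x h chroma motion compensation: 2D bilinear interpolation with the
 * weights A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y, rounded with
 * +32 and shifted right by 6.  OP_U8_ALTIVEC (defined by the file that
 * includes this template) makes this either the "put" or the "avg" variant.
 * Unaligned src is handled with vec_lvsl/vec_perm; loadSecond and
 * reallyBadAlign skip a second load or permute when it is not needed. */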
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

/* this code assumes that stride % 16 == 0 */
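/* Identical bilinear interpolation, but for VC-1's "no rounding" mode: the
 * rounding bias is 28 instead of 32 (added via add28 after the multiply-add
 * chain), and there are no B == 0 / C == 0 fast paths. */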
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
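/* 16 x 16 luma horizontal 6-tap lowpass filter [1 -5 20 20 -5 1]:
 * 20*(P0+P1) - 5*(M1+P2) + (M2+P3), rounded with +16, >>5 and saturated to
 * unsigned 8 bits.  The switch on 'align' covers source alignments where the
 * 21 bytes needed per row span three aligned 16-byte blocks (a third load,
 * srcR3, is then required); case 11 merely avoids a redundant vec_perm. */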
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
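/* 16 x 16 luma vertical 6-tap lowpass filter (same [1 -5 20 20 -5 1] kernel,
 * +16, >>5).  The six source rows are kept in registers and shifted down by
 * one row each iteration, so only one new row (srcP3) is loaded per line. */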
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
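/* 16 x 16 luma 2D (horizontal, then vertical) 6-tap lowpass filter.  The
 * first pass filters 21 rows (16 output rows plus 5 extra for the vertical
 * window) horizontally, with no rounding or shift, into the 16-bit tmp
 * buffer.  The second pass filters tmp vertically in 32-bit precision and
 * normalizes the result with +512 and >>10. */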
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
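    /* first (horizontal) pass: 21 rows of intermediate 16-bit results into tmp */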
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
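    /* second (vertical) pass: filter tmp in 32-bit precision, then +512, >>10 */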
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}