Libav
|
00001 /* 00002 * idct for sh4 00003 * 00004 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp> 00005 * 00006 * This file is part of FFmpeg. 00007 * 00008 * FFmpeg is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2.1 of the License, or (at your option) any later version. 00012 * 00013 * FFmpeg is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with FFmpeg; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00023 #include "libavcodec/dsputil.h" 00024 #include "dsputil_sh4.h" 00025 #include "sh4.h" 00026 00027 #define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */ 00028 #define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */ 00029 #define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */ 00030 #define c4 1.00000000000000000000 /* sqrt(2)*cos(4*pi/16) */ 00031 #define c5 0.78569495838710234903 /* sqrt(2)*cos(5*pi/16) */ 00032 #define c6 0.54119610014619712324 /* sqrt(2)*cos(6*pi/16) */ 00033 #define c7 0.27589937928294311353 /* sqrt(2)*cos(7*pi/16) */ 00034 00035 static const float even_table[] __attribute__ ((aligned(8))) = { 00036 c4, c4, c4, c4, 00037 c2, c6,-c6,-c2, 00038 c4,-c4,-c4, c4, 00039 c6,-c2, c2,-c6 00040 }; 00041 00042 static const float odd_table[] __attribute__ ((aligned(8))) = { 00043 c1, c3, c5, c7, 00044 c3,-c7,-c1,-c5, 00045 c5,-c1, c7, c3, 00046 c7,-c5, c3,-c1 00047 }; 00048 00049 #undef c1 00050 #undef c2 00051 #undef c3 00052 #undef c4 00053 #undef c5 00054 #undef c6 00055 #undef c7 00056 00057 #if 1 00058 00059 #define load_matrix(table) \ 00060 do { \ 00061 const float *t = table; \ 00062 __asm__ volatile( \ 00063 " fschg\n" \ 00064 " fmov @%0+,xd0\n" \ 00065 " fmov @%0+,xd2\n" \ 00066 " fmov @%0+,xd4\n" \ 00067 " fmov @%0+,xd6\n" \ 00068 " fmov @%0+,xd8\n" \ 00069 " fmov @%0+,xd10\n" \ 00070 " fmov @%0+,xd12\n" \ 00071 " fmov @%0+,xd14\n" \ 00072 " fschg\n" \ 00073 : "+r"(t) \ 00074 ); \ 00075 } while (0) 00076 00077 #define ftrv() \ 00078 __asm__ volatile("ftrv xmtrx,fv0" \ 00079 : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3)); 00080 00081 #define DEFREG \ 00082 register float fr0 __asm__("fr0"); \ 00083 register float fr1 __asm__("fr1"); \ 00084 register float fr2 __asm__("fr2"); \ 00085 register float fr3 __asm__("fr3") 00086 00087 #else 00088 00089 /* generic C code for check */ 00090 00091 static void ftrv_(const float xf[],float fv[]) 00092 { 00093 float f0,f1,f2,f3; 00094 f0 = fv[0]; 00095 f1 = fv[1]; 00096 f2 = fv[2]; 00097 f3 = fv[3]; 00098 fv[0] = xf[0]*f0 + xf[4]*f1 + xf[ 8]*f2 + xf[12]*f3; 00099 fv[1] = xf[1]*f0 + xf[5]*f1 + xf[ 9]*f2 + xf[13]*f3; 00100 fv[2] = xf[2]*f0 + xf[6]*f1 + xf[10]*f2 + xf[14]*f3; 00101 fv[3] = xf[3]*f0 + xf[7]*f1 + xf[11]*f2 + xf[15]*f3; 00102 } 00103 00104 static void load_matrix_(float xf[],const float table[]) 00105 { 00106 int i; 00107 for(i=0;i<16;i++) xf[i]=table[i]; 00108 } 00109 00110 #define ftrv() ftrv_(xf,fv) 00111 #define load_matrix(table) load_matrix_(xf,table) 00112 00113 #define DEFREG \ 00114 float fv[4],xf[16] 00115 00116 #define fr0 fv[0] 00117 #define fr1 fv[1] 00118 #define fr2 fv[2] 00119 #define fr3 fv[3] 00120 00121 #endif 00122 00123 #if 1 00124 #define DESCALE(x,n) (x)*(1.0f/(1<<(n))) 00125 #else 00126 #define DESCALE(x,n) (((int)(x)+(1<<(n-1)))>>(n)) 00127 #endif 00128 00129 /* this code work worse on gcc cvs. 3.2.3 work fine */ 00130 00131 00132 #if 1 00133 //optimized 00134 00135 void idct_sh4(DCTELEM *block) 00136 { 00137 DEFREG; 00138 00139 int i; 00140 float tblock[8*8],*fblock; 00141 int ofs1,ofs2,ofs3; 00142 int fpscr; 00143 00144 fp_single_enter(fpscr); 00145 00146 /* row */ 00147 00148 /* even part */ 00149 load_matrix(even_table); 00150 00151 fblock = tblock+4; 00152 i = 8; 00153 do { 00154 fr0 = block[0]; 00155 fr1 = block[2]; 00156 fr2 = block[4]; 00157 fr3 = block[6]; 00158 block+=8; 00159 ftrv(); 00160 *--fblock = fr3; 00161 *--fblock = fr2; 00162 *--fblock = fr1; 00163 *--fblock = fr0; 00164 fblock+=8+4; 00165 } while(--i); 00166 block-=8*8; 00167 fblock-=8*8+4; 00168 00169 load_matrix(odd_table); 00170 00171 i = 8; 00172 00173 do { 00174 float t0,t1,t2,t3; 00175 fr0 = block[1]; 00176 fr1 = block[3]; 00177 fr2 = block[5]; 00178 fr3 = block[7]; 00179 block+=8; 00180 ftrv(); 00181 t0 = *fblock++; 00182 t1 = *fblock++; 00183 t2 = *fblock++; 00184 t3 = *fblock++; 00185 fblock+=4; 00186 *--fblock = t0 - fr0; 00187 *--fblock = t1 - fr1; 00188 *--fblock = t2 - fr2; 00189 *--fblock = t3 - fr3; 00190 *--fblock = t3 + fr3; 00191 *--fblock = t2 + fr2; 00192 *--fblock = t1 + fr1; 00193 *--fblock = t0 + fr0; 00194 fblock+=8; 00195 } while(--i); 00196 block-=8*8; 00197 fblock-=8*8; 00198 00199 /* col */ 00200 00201 /* even part */ 00202 load_matrix(even_table); 00203 00204 ofs1 = sizeof(float)*2*8; 00205 ofs2 = sizeof(float)*4*8; 00206 ofs3 = sizeof(float)*6*8; 00207 00208 i = 8; 00209 00210 #define OA(fblock,ofs) *(float*)((char*)fblock + ofs) 00211 00212 do { 00213 fr0 = OA(fblock, 0); 00214 fr1 = OA(fblock,ofs1); 00215 fr2 = OA(fblock,ofs2); 00216 fr3 = OA(fblock,ofs3); 00217 ftrv(); 00218 OA(fblock,0 ) = fr0; 00219 OA(fblock,ofs1) = fr1; 00220 OA(fblock,ofs2) = fr2; 00221 OA(fblock,ofs3) = fr3; 00222 fblock++; 00223 } while(--i); 00224 fblock-=8; 00225 00226 load_matrix(odd_table); 00227 00228 i=8; 00229 do { 00230 float t0,t1,t2,t3; 00231 t0 = OA(fblock, 0); /* [8*0] */ 00232 t1 = OA(fblock,ofs1); /* [8*2] */ 00233 t2 = OA(fblock,ofs2); /* [8*4] */ 00234 t3 = OA(fblock,ofs3); /* [8*6] */ 00235 fblock+=8; 00236 fr0 = OA(fblock, 0); /* [8*1] */ 00237 fr1 = OA(fblock,ofs1); /* [8*3] */ 00238 fr2 = OA(fblock,ofs2); /* [8*5] */ 00239 fr3 = OA(fblock,ofs3); /* [8*7] */ 00240 fblock+=-8+1; 00241 ftrv(); 00242 block[8*0] = DESCALE(t0 + fr0,3); 00243 block[8*7] = DESCALE(t0 - fr0,3); 00244 block[8*1] = DESCALE(t1 + fr1,3); 00245 block[8*6] = DESCALE(t1 - fr1,3); 00246 block[8*2] = DESCALE(t2 + fr2,3); 00247 block[8*5] = DESCALE(t2 - fr2,3); 00248 block[8*3] = DESCALE(t3 + fr3,3); 00249 block[8*4] = DESCALE(t3 - fr3,3); 00250 block++; 00251 } while(--i); 00252 00253 fp_single_leave(fpscr); 00254 } 00255 #else 00256 void idct_sh4(DCTELEM *block) 00257 { 00258 DEFREG; 00259 00260 int i; 00261 float tblock[8*8],*fblock; 00262 00263 /* row */ 00264 00265 /* even part */ 00266 load_matrix(even_table); 00267 00268 fblock = tblock; 00269 i = 8; 00270 do { 00271 fr0 = block[0]; 00272 fr1 = block[2]; 00273 fr2 = block[4]; 00274 fr3 = block[6]; 00275 block+=8; 00276 ftrv(); 00277 fblock[0] = fr0; 00278 fblock[2] = fr1; 00279 fblock[4] = fr2; 00280 fblock[6] = fr3; 00281 fblock+=8; 00282 } while(--i); 00283 block-=8*8; 00284 fblock-=8*8; 00285 00286 load_matrix(odd_table); 00287 00288 i = 8; 00289 00290 do { 00291 float t0,t1,t2,t3; 00292 fr0 = block[1]; 00293 fr1 = block[3]; 00294 fr2 = block[5]; 00295 fr3 = block[7]; 00296 block+=8; 00297 ftrv(); 00298 t0 = fblock[0]; 00299 t1 = fblock[2]; 00300 t2 = fblock[4]; 00301 t3 = fblock[6]; 00302 fblock[0] = t0 + fr0; 00303 fblock[7] = t0 - fr0; 00304 fblock[1] = t1 + fr1; 00305 fblock[6] = t1 - fr1; 00306 fblock[2] = t2 + fr2; 00307 fblock[5] = t2 - fr2; 00308 fblock[3] = t3 + fr3; 00309 fblock[4] = t3 - fr3; 00310 fblock+=8; 00311 } while(--i); 00312 block-=8*8; 00313 fblock-=8*8; 00314 00315 /* col */ 00316 00317 /* even part */ 00318 load_matrix(even_table); 00319 00320 i = 8; 00321 00322 do { 00323 fr0 = fblock[8*0]; 00324 fr1 = fblock[8*2]; 00325 fr2 = fblock[8*4]; 00326 fr3 = fblock[8*6]; 00327 ftrv(); 00328 fblock[8*0] = fr0; 00329 fblock[8*2] = fr1; 00330 fblock[8*4] = fr2; 00331 fblock[8*6] = fr3; 00332 fblock++; 00333 } while(--i); 00334 fblock-=8; 00335 00336 load_matrix(odd_table); 00337 00338 i=8; 00339 do { 00340 float t0,t1,t2,t3; 00341 fr0 = fblock[8*1]; 00342 fr1 = fblock[8*3]; 00343 fr2 = fblock[8*5]; 00344 fr3 = fblock[8*7]; 00345 ftrv(); 00346 t0 = fblock[8*0]; 00347 t1 = fblock[8*2]; 00348 t2 = fblock[8*4]; 00349 t3 = fblock[8*6]; 00350 fblock++; 00351 block[8*0] = DESCALE(t0 + fr0,3); 00352 block[8*7] = DESCALE(t0 - fr0,3); 00353 block[8*1] = DESCALE(t1 + fr1,3); 00354 block[8*6] = DESCALE(t1 - fr1,3); 00355 block[8*2] = DESCALE(t2 + fr2,3); 00356 block[8*5] = DESCALE(t2 - fr2,3); 00357 block[8*3] = DESCALE(t3 + fr3,3); 00358 block[8*4] = DESCALE(t3 - fr3,3); 00359 block++; 00360 } while(--i); 00361 } 00362 #endif