/* Libav */
00001 /* 00002 * idct_mmx.c 00003 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> 00004 * 00005 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. 00006 * See http://libmpeg2.sourceforge.net/ for updates. 00007 * 00008 * mpeg2dec is free software; you can redistribute it and/or modify 00009 * it under the terms of the GNU General Public License as published by 00010 * the Free Software Foundation; either version 2 of the License, or 00011 * (at your option) any later version. 00012 * 00013 * mpeg2dec is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 * GNU General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU General Public License 00019 * along with mpeg2dec; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00023 #include "libavutil/common.h" 00024 #include "libavcodec/dsputil.h" 00025 00026 #include "dsputil_mmx.h" 00027 #include "mmx.h" 00028 00029 #define ROW_SHIFT 11 00030 #define COL_SHIFT 6 00031 00032 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) 00033 #define rounder(bias) {round (bias), round (bias)} 00034 00035 00036 #if 0 00037 /* C row IDCT - it is just here to document the MMXEXT and MMX versions */ 00038 static inline void idct_row (int16_t * row, int offset, 00039 int16_t * table, int32_t * rounder) 00040 { 00041 int C1, C2, C3, C4, C5, C6, C7; 00042 int a0, a1, a2, a3, b0, b1, b2, b3; 00043 00044 row += offset; 00045 00046 C1 = table[1]; 00047 C2 = table[2]; 00048 C3 = table[3]; 00049 C4 = table[4]; 00050 C5 = table[5]; 00051 C6 = table[6]; 00052 C7 = table[7]; 00053 00054 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; 00055 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; 00056 a2 = C4*row[0] - 
C6*row[2] - C4*row[4] + C2*row[6] + *rounder; 00057 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; 00058 00059 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; 00060 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; 00061 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; 00062 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; 00063 00064 row[0] = (a0 + b0) >> ROW_SHIFT; 00065 row[1] = (a1 + b1) >> ROW_SHIFT; 00066 row[2] = (a2 + b2) >> ROW_SHIFT; 00067 row[3] = (a3 + b3) >> ROW_SHIFT; 00068 row[4] = (a3 - b3) >> ROW_SHIFT; 00069 row[5] = (a2 - b2) >> ROW_SHIFT; 00070 row[6] = (a1 - b1) >> ROW_SHIFT; 00071 row[7] = (a0 - b0) >> ROW_SHIFT; 00072 } 00073 #endif 00074 00075 00076 /* MMXEXT row IDCT */ 00077 00078 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ 00079 c4, c6, c4, c6, \ 00080 c1, c3, -c1, -c5, \ 00081 c5, c7, c3, -c7, \ 00082 c4, -c6, c4, -c6, \ 00083 -c4, c2, c4, -c2, \ 00084 c5, -c1, c3, -c1, \ 00085 c7, c3, c7, -c5 } 00086 00087 static inline void mmxext_row_head (int16_t * const row, const int offset, 00088 const int16_t * const table) 00089 { 00090 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 00091 00092 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 00093 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 00094 00095 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ 00096 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 00097 00098 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 00099 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ 00100 00101 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 00102 } 00103 00104 static inline void mmxext_row (const int16_t * const table, 00105 const int32_t * const rounder) 00106 { 00107 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ 00108 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ 00109 00110 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ 00111 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ 
00112 00113 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ 00114 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ 00115 00116 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ 00117 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ 00118 00119 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ 00120 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ 00121 00122 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ 00123 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ 00124 00125 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ 00126 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ 00127 00128 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ 00129 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ 00130 00131 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ 00132 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ 00133 00134 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ 00135 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ 00136 00137 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ 00138 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ 00139 00140 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ 00141 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ 00142 } 00143 00144 static inline void mmxext_row_tail (int16_t * const row, const int store) 00145 { 00146 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 00147 00148 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ 00149 00150 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 00151 00152 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ 00153 00154 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 00155 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 00156 00157 /* slot */ 00158 00159 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 00160 } 00161 00162 static inline void mmxext_row_mid (int16_t * const row, const int store, 00163 const int offset, 00164 const int16_t * const table) 00165 { 00166 movq_m2r (*(row+offset), mm2); /* 
mm2 = x6 x4 x2 x0 */ 00167 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 00168 00169 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 00170 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ 00171 00172 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 00173 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 00174 00175 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ 00176 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 00177 00178 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 00179 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 00180 00181 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ 00182 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 00183 00184 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ 00185 00186 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 00187 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 00188 } 00189 00190 00191 /* MMX row IDCT */ 00192 00193 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ 00194 c4, c6, -c4, -c2, \ 00195 c1, c3, c3, -c7, \ 00196 c5, c7, -c1, -c5, \ 00197 c4, -c6, c4, -c2, \ 00198 -c4, c2, c4, -c6, \ 00199 c5, -c1, c7, -c5, \ 00200 c7, c3, c3, -c1 } 00201 00202 static inline void mmx_row_head (int16_t * const row, const int offset, 00203 const int16_t * const table) 00204 { 00205 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 00206 00207 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 00208 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 00209 00210 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ 00211 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 00212 00213 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ 00214 00215 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ 00216 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ 00217 00218 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ 00219 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ 00220 } 00221 00222 static inline void mmx_row (const int16_t * const table, 00223 const int32_t * const 
rounder) 00224 { 00225 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ 00226 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ 00227 00228 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ 00229 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ 00230 00231 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ 00232 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ 00233 00234 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ 00235 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ 00236 00237 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ 00238 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ 00239 00240 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ 00241 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ 00242 00243 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ 00244 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ 00245 00246 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ 00247 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ 00248 00249 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ 00250 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ 00251 00252 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ 00253 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ 00254 00255 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ 00256 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ 00257 00258 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ 00259 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ 00260 } 00261 00262 static inline void mmx_row_tail (int16_t * const row, const int store) 00263 { 00264 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 00265 00266 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ 00267 00268 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 00269 00270 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ 00271 00272 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 00273 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ 00274 00275 pslld_i2r 
(16, mm7); /* mm7 = y7 0 y5 0 */ 00276 00277 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ 00278 00279 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ 00280 00281 /* slot */ 00282 00283 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 00284 } 00285 00286 static inline void mmx_row_mid (int16_t * const row, const int store, 00287 const int offset, const int16_t * const table) 00288 { 00289 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 00290 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 00291 00292 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 00293 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ 00294 00295 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 00296 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 00297 00298 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ 00299 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 00300 00301 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 00302 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ 00303 00304 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ 00305 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ 00306 00307 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ 00308 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ 00309 00310 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ 00311 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ 00312 00313 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ 00314 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ 00315 00316 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 00317 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ 00318 } 00319 00320 00321 #if 0 00322 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */ 00323 static inline void idct_col (int16_t * col, int offset) 00324 { 00325 /* multiplication - as implemented on mmx */ 00326 #define F(c,x) (((c) * (x)) >> 16) 00327 00328 /* saturation - it helps us handle torture test cases */ 00329 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? 
-32768 : (x)) 00330 00331 int16_t x0, x1, x2, x3, x4, x5, x6, x7; 00332 int16_t y0, y1, y2, y3, y4, y5, y6, y7; 00333 int16_t a0, a1, a2, a3, b0, b1, b2, b3; 00334 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; 00335 00336 col += offset; 00337 00338 x0 = col[0*8]; 00339 x1 = col[1*8]; 00340 x2 = col[2*8]; 00341 x3 = col[3*8]; 00342 x4 = col[4*8]; 00343 x5 = col[5*8]; 00344 x6 = col[6*8]; 00345 x7 = col[7*8]; 00346 00347 u04 = S (x0 + x4); 00348 v04 = S (x0 - x4); 00349 u26 = S (F (T2, x6) + x2); 00350 v26 = S (F (T2, x2) - x6); 00351 00352 a0 = S (u04 + u26); 00353 a1 = S (v04 + v26); 00354 a2 = S (v04 - v26); 00355 a3 = S (u04 - u26); 00356 00357 u17 = S (F (T1, x7) + x1); 00358 v17 = S (F (T1, x1) - x7); 00359 u35 = S (F (T3, x5) + x3); 00360 v35 = S (F (T3, x3) - x5); 00361 00362 b0 = S (u17 + u35); 00363 b3 = S (v17 - v35); 00364 u12 = S (u17 - u35); 00365 v12 = S (v17 + v35); 00366 u12 = S (2 * F (C4, u12)); 00367 v12 = S (2 * F (C4, v12)); 00368 b1 = S (u12 + v12); 00369 b2 = S (u12 - v12); 00370 00371 y0 = S (a0 + b0) >> COL_SHIFT; 00372 y1 = S (a1 + b1) >> COL_SHIFT; 00373 y2 = S (a2 + b2) >> COL_SHIFT; 00374 y3 = S (a3 + b3) >> COL_SHIFT; 00375 00376 y4 = S (a3 - b3) >> COL_SHIFT; 00377 y5 = S (a2 - b2) >> COL_SHIFT; 00378 y6 = S (a1 - b1) >> COL_SHIFT; 00379 y7 = S (a0 - b0) >> COL_SHIFT; 00380 00381 col[0*8] = y0; 00382 col[1*8] = y1; 00383 col[2*8] = y2; 00384 col[3*8] = y3; 00385 col[4*8] = y4; 00386 col[5*8] = y5; 00387 col[6*8] = y6; 00388 col[7*8] = y7; 00389 } 00390 #endif 00391 00392 00393 /* MMX column IDCT */ 00394 static inline void idct_col (int16_t * const col, const int offset) 00395 { 00396 #define T1 13036 00397 #define T2 27146 00398 #define T3 43790 00399 #define C4 23170 00400 00401 DECLARE_ALIGNED(8, static const short, t1_vector)[] = {T1,T1,T1,T1}; 00402 DECLARE_ALIGNED(8, static const short, t2_vector)[] = {T2,T2,T2,T2}; 00403 DECLARE_ALIGNED(8, static const short, t3_vector)[] = {T3,T3,T3,T3}; 00404 DECLARE_ALIGNED(8, 
static const short, c4_vector)[] = {C4,C4,C4,C4}; 00405 00406 /* column code adapted from Peter Gubanov */ 00407 /* http://www.elecard.com/peter/idct.shtml */ 00408 00409 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ 00410 00411 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ 00412 movq_r2r (mm0, mm2); /* mm2 = T1 */ 00413 00414 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ 00415 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ 00416 00417 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ 00418 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ 00419 00420 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ 00421 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ 00422 00423 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ 00424 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ 00425 00426 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ 00427 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ 00428 00429 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ 00430 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ 00431 00432 /* slot */ 00433 00434 movq_r2r (mm4, mm2); /* mm2 = T2 */ 00435 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ 00436 00437 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ 00438 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ 00439 00440 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ 00441 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ 00442 00443 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ 00444 movq_r2r (mm0, mm6); /* mm6 = v17 */ 00445 00446 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ 00447 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ 00448 00449 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ 00450 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ 00451 00452 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ 00453 movq_r2r (mm1, mm6); /* mm6 = u17 */ 00454 00455 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ 00456 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ 00457 00458 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ 00459 movq_r2r (mm1, mm7); /* mm7 = u12 */ 00460 00461 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ 00462 paddsw_r2r (mm5, mm1); /* mm1 = 
u12+v12 */ 00463 00464 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ 00465 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ 00466 00467 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ 00468 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ 00469 00470 movq_r2r (mm4, mm6); /* mm6 = v26 */ 00471 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ 00472 00473 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ 00474 movq_r2r (mm3, mm0); /* mm0 = x0 */ 00475 00476 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ 00477 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ 00478 00479 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ 00480 movq_r2r (mm0, mm5); /* mm5 = u04 */ 00481 00482 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ 00483 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ 00484 00485 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ 00486 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ 00487 00488 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ 00489 movq_r2r (mm3, mm2); /* mm2 = a2 */ 00490 00491 movq_r2r (mm4, mm6); /* mm6 = a1 */ 00492 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ 00493 00494 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ 00495 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ 00496 00497 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ 00498 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ 00499 00500 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ 00501 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ 00502 00503 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ 00504 movq_r2r (mm5, mm7); /* mm7 = a0 */ 00505 00506 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ 00507 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ 00508 00509 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ 00510 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ 00511 00512 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ 00513 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ 00514 00515 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ 00516 movq_r2r (mm0, mm3); /* mm3 = a3 */ 00517 00518 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ 00519 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ 00520 00521 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ 
00522 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ 00523 00524 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ 00525 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ 00526 00527 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ 00528 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ 00529 00530 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ 00531 00532 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ 00533 00534 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ 00535 00536 #undef T1 00537 #undef T2 00538 #undef T3 00539 #undef C4 00540 } 00541 00542 00543 DECLARE_ALIGNED(8, static const int32_t, rounder0)[] = 00544 rounder ((1 << (COL_SHIFT - 1)) - 0.5); 00545 DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0); 00546 DECLARE_ALIGNED(8, static const int32_t, rounder1)[] = 00547 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ 00548 DECLARE_ALIGNED(8, static const int32_t, rounder7)[] = 00549 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ 00550 DECLARE_ALIGNED(8, static const int32_t, rounder2)[] = 00551 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ 00552 DECLARE_ALIGNED(8, static const int32_t, rounder6)[] = 00553 rounder (-0.25); /* C2 * (C6-C2)/2 */ 00554 DECLARE_ALIGNED(8, static const int32_t, rounder3)[] = 00555 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ 00556 DECLARE_ALIGNED(8, static const int32_t, rounder5)[] = 00557 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ 00558 00559 #undef COL_SHIFT 00560 #undef ROW_SHIFT 00561 00562 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ 00563 void idct (int16_t * const block) \ 00564 { \ 00565 DECLARE_ALIGNED(16, static const int16_t, table04)[] = \ 00566 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ 00567 DECLARE_ALIGNED(16, static const int16_t, table17)[] = \ 00568 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ 00569 DECLARE_ALIGNED(16, static const int16_t, table26)[] = \ 00570 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ 00571 
DECLARE_ALIGNED(16, static const int16_t, table35)[] = \ 00572 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ 00573 \ 00574 idct_row_head (block, 0*8, table04); \ 00575 idct_row (table04, rounder0); \ 00576 idct_row_mid (block, 0*8, 4*8, table04); \ 00577 idct_row (table04, rounder4); \ 00578 idct_row_mid (block, 4*8, 1*8, table17); \ 00579 idct_row (table17, rounder1); \ 00580 idct_row_mid (block, 1*8, 7*8, table17); \ 00581 idct_row (table17, rounder7); \ 00582 idct_row_mid (block, 7*8, 2*8, table26); \ 00583 idct_row (table26, rounder2); \ 00584 idct_row_mid (block, 2*8, 6*8, table26); \ 00585 idct_row (table26, rounder6); \ 00586 idct_row_mid (block, 6*8, 3*8, table35); \ 00587 idct_row (table35, rounder3); \ 00588 idct_row_mid (block, 3*8, 5*8, table35); \ 00589 idct_row (table35, rounder5); \ 00590 idct_row_tail (block, 5*8); \ 00591 \ 00592 idct_col (block, 0); \ 00593 idct_col (block, 4); \ 00594 } 00595 00596 declare_idct (ff_mmxext_idct, mmxext_table, 00597 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) 00598 00599 declare_idct (ff_mmx_idct, mmx_table, 00600 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) 00601