Libav 0.7.1
|
/*
 * idct_mmx.c
 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with mpeg2dec; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavcodec/dsputil.h"

#include "libavutil/x86_cpu.h"
#include "dsputil_mmx.h"

/* Fixed-point precision of the two 1-D passes: rows keep 11 fractional
 * bits, columns are sheared back by 6 at the end. */
#define ROW_SHIFT 11
#define COL_SHIFT 6

/* Build a per-row rounding constant pair (two identical int32 lanes for
 * paddd).  NOTE(review): this `round` macro shadows the C library round();
 * harmless here since <math.h>'s round is never called in this file. */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}


#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part: rows 0/2/4/6 with the rounder folded in */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part: rows 1/3/5/7 */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly and rescale */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif


/* MMXEXT row IDCT */

/* Coefficient layout consumed by pmaddwd in the MMXEXT row functions:
 * eight quads of int16 cosine coefficients, ordered to match the
 * pshufw-based operand shuffling below. */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4,  c2,  -c4,  -c2,   \
                                             c4,  c6,   c4,   c6,   \
                                             c1,  c3,  -c1,  -c5,   \
                                             c5,  c7,   c3,  -c7,   \
                                             c4, -c6,   c4,  -c6,   \
                                            -c4,  c2,   c4,  -c2,   \
                                             c5, -c1,   c3,  -c1,   \
                                             c7,  c3,   c7,  -c5 }

/* Start the pipelined row pass: load one 8-sample row and the first two
 * coefficient quads, and issue the first pmaddwd.
 *
 * NOTE(review): mmxext_row_head / mmxext_row / mmxext_row_mid /
 * mmxext_row_tail are fragments of one software pipeline.  They
 * communicate through MMX registers (mm0-mm7) that are set in one
 * __asm__ statement and consumed in the next, with no operands declaring
 * this -- correctness relies on the compiler emitting no MMX code in
 * between.  Do not reorder or insert code between the calls. */
static inline void mmxext_row_head (int16_t * const row, const int offset,
                                    const int16_t * const table)
{
    __asm__ volatile(
        "movq     (%0), %%mm2        \n\t" /* mm2 = x6 x4 x2 x0 */

        "movq     8(%0), %%mm5       \n\t" /* mm5 = x7 x5 x3 x1 */
        "movq     %%mm2, %%mm0       \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq     (%1), %%mm3        \n\t" /* mm3 = -C2 -C4 C2 C4 */
        "movq     %%mm5, %%mm6       \n\t" /* mm6 = x7 x5 x3 x1 */

        "movq     8(%1), %%mm4       \n\t" /* mm4 = C6 C4 C6 C4 */
        "pmaddwd  %%mm0, %%mm3       \n\t" /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */

        "pshufw   $0x4e, %%mm2, %%mm2 \n\t" /* mm2 = x2 x0 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
    );
}

/* Middle of the row pipeline: finish the multiply/accumulate for the row
 * loaded by the previous head/mid call.  Leaves y0..y7 partially shifted
 * in mm0/mm1/mm3/mm4 for the following mid/tail call to pack and store. */
static inline void mmxext_row (const int16_t * const table,
                               const int32_t * const rounder)
{
    __asm__ volatile (
        "movq     16(%0), %%mm1      \n\t" /* mm1 = -C5 -C1 C3 C1 */
        "pmaddwd  %%mm2, %%mm4       \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */

        "pmaddwd  32(%0), %%mm0      \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
        "pshufw   $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */

        "movq     24(%0), %%mm7      \n\t" /* mm7 = -C7 C3 C7 C5 */
        "pmaddwd  %%mm5, %%mm1       \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */

        "paddd    (%1), %%mm3        \n\t" /* mm3 += rounder */
        "pmaddwd  %%mm6, %%mm7       \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */

        "pmaddwd  40(%0), %%mm2      \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */
        "paddd    %%mm4, %%mm3       \n\t" /* mm3 = a1 a0 + rounder */

        "pmaddwd  48(%0), %%mm5      \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
        "movq     %%mm3, %%mm4       \n\t" /* mm4 = a1 a0 + rounder */

        "pmaddwd  56(%0), %%mm6      \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
        "paddd    %%mm7, %%mm1       \n\t" /* mm1 = b1 b0 */

        "paddd    (%1), %%mm0        \n\t" /* mm0 += rounder */
        "psubd    %%mm1, %%mm3       \n\t" /* mm3 = a1-b1 a0-b0 + rounder */

        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
        "paddd    %%mm4, %%mm1       \n\t" /* mm1 = a1+b1 a0+b0 + rounder */

        "paddd    %%mm2, %%mm0       \n\t" /* mm0 = a3 a2 + rounder */
        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */

        "paddd    %%mm6, %%mm5       \n\t" /* mm5 = b3 b2 */
        "movq     %%mm0, %%mm4       \n\t" /* mm4 = a3 a2 + rounder */

        "paddd    %%mm5, %%mm0       \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
        "psubd    %%mm5, %%mm4       \n\t" /* mm4 = a3-b3 a2-b2 + rounder */
        : : "r" (table), "r" (rounder));
}

/* Drain the row pipeline: shift, pack and store the last row's results
 * (y0..y7) left in registers by the final mmxext_row call. */
static inline void mmxext_row_tail (int16_t * const row, const int store)
{
    __asm__ volatile (
        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */

        "packssdw %%mm0, %%mm1       \n\t" /* mm1 = y3 y2 y1 y0 */

        "packssdw %%mm3, %%mm4       \n\t" /* mm4 = y6 y7 y4 y5 */

        "movq     %%mm1, (%0)        \n\t" /* save y3 y2 y1 y0 */
        "pshufw   $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */

        /* slot */

        "movq     %%mm4, 8(%0)       \n\t" /* save y7 y6 y5 y4 */
        :: "r" (row+store)
    );
}

/* Pipeline overlap stage: store the previous row's results while loading
 * the next row and issuing its first multiplies (combines the work of
 * mmxext_row_tail and mmxext_row_head for back-to-back rows). */
static inline void mmxext_row_mid (int16_t * const row, const int store,
                                   const int offset,
                                   const int16_t * const table)
{
    __asm__ volatile (
        "movq     (%0,%1), %%mm2     \n\t" /* mm2 = x6 x4 x2 x0 */
        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "movq     8(%0,%1), %%mm5    \n\t" /* mm5 = x7 x5 x3 x1 */
        "psrad    $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */

        "packssdw %%mm0, %%mm1       \n\t" /* mm1 = y3 y2 y1 y0 */
        "movq     %%mm5, %%mm6       \n\t" /* mm6 = x7 x5 x3 x1 */

        "packssdw %%mm3, %%mm4       \n\t" /* mm4 = y6 y7 y4 y5 */
        "movq     %%mm2, %%mm0       \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq     %%mm1, (%0,%2)     \n\t" /* save y3 y2 y1 y0 */
        "pshufw   $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */

        "movq     (%3), %%mm3        \n\t" /* mm3 = -C2 -C4 C2 C4 */
        "movq     %%mm4, 8(%0,%2)    \n\t" /* save y7 y6 y5 y4 */

        "pmaddwd  %%mm0, %%mm3       \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */

        "movq     8(%3), %%mm4       \n\t" /* mm4 = C6 C4 C6 C4 */
        "pshufw   $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */
        :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
    );
}


/* MMX row IDCT */

/* Coefficient layout for the plain-MMX row pass.  Same coefficients as
 * mmxext_table but ordered for punpckldq/punpckhdq operand duplication
 * instead of pshufw (which plain MMX lacks). */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,   c4,   c6,   \
                                           c4,  c6,  -c4,  -c2,   \
                                           c1,  c3,   c3,  -c7,   \
                                           c5,  c7,  -c1,  -c5,   \
                                           c4, -c6,   c4,  -c2,   \
                                          -c4,  c2,   c4,  -c6,   \
                                           c5, -c1,   c7,  -c5,   \
                                           c7,  c3,   c3,  -c1 }

/* Plain-MMX counterpart of mmxext_row_head; same register-pipeline
 * contract applies (see the NOTE on mmxext_row_head). */
static inline void mmx_row_head (int16_t * const row, const int offset,
                                 const int16_t * const table)
{
    __asm__ volatile (
        "movq      (%0), %%mm2       \n\t" /* mm2 = x6 x4 x2 x0 */

        "movq      8(%0), %%mm5      \n\t" /* mm5 = x7 x5 x3 x1 */
        "movq      %%mm2, %%mm0      \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq      (%1), %%mm3       \n\t" /* mm3 = C6 C4 C2 C4 */
        "movq      %%mm5, %%mm6      \n\t" /* mm6 = x7 x5 x3 x1 */

        "punpckldq %%mm0, %%mm0      \n\t" /* mm0 = x2 x0 x2 x0 */

        "movq      8(%1), %%mm4      \n\t" /* mm4 = -C2 -C4 C6 C4 */
        "pmaddwd   %%mm0, %%mm3      \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

        "movq      16(%1), %%mm1     \n\t" /* mm1 = -C7 C3 C3 C1 */
        "punpckhdq %%mm2, %%mm2      \n\t" /* mm2 = x6 x4 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
    );
}

/* Plain-MMX counterpart of mmxext_row.  Result lanes end up in
 * mm0/mm1/mm3/mm7 (mm7 instead of mm4) for mmx_row_tail / mmx_row_mid. */
static inline void mmx_row (const int16_t * const table,
                            const int32_t * const rounder)
{
    __asm__ volatile (
        "pmaddwd   %%mm2, %%mm4      \n\t" /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
        "punpckldq %%mm5, %%mm5      \n\t" /* mm5 = x3 x1 x3 x1 */

        "pmaddwd   32(%0), %%mm0     \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
        "punpckhdq %%mm6, %%mm6      \n\t" /* mm6 = x7 x5 x7 x5 */

        "movq      24(%0), %%mm7     \n\t" /* mm7 = -C5 -C1 C7 C5 */
        "pmaddwd   %%mm5, %%mm1      \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */

        "paddd     (%1), %%mm3       \n\t" /* mm3 += rounder */
        "pmaddwd   %%mm6, %%mm7      \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */

        "pmaddwd   40(%0), %%mm2     \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
        "paddd     %%mm4, %%mm3      \n\t" /* mm3 = a1 a0 + rounder */

        "pmaddwd   48(%0), %%mm5     \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
        "movq      %%mm3, %%mm4      \n\t" /* mm4 = a1 a0 + rounder */

        "pmaddwd   56(%0), %%mm6     \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
        "paddd     %%mm7, %%mm1      \n\t" /* mm1 = b1 b0 */

        "paddd     (%1), %%mm0       \n\t" /* mm0 += rounder */
        "psubd     %%mm1, %%mm3      \n\t" /* mm3 = a1-b1 a0-b0 + rounder */

        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
        "paddd     %%mm4, %%mm1      \n\t" /* mm1 = a1+b1 a0+b0 + rounder */

        "paddd     %%mm2, %%mm0      \n\t" /* mm0 = a3 a2 + rounder */
        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */

        "paddd     %%mm6, %%mm5      \n\t" /* mm5 = b3 b2 */
        "movq      %%mm0, %%mm7      \n\t" /* mm7 = a3 a2 + rounder */

        "paddd     %%mm5, %%mm0      \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
        "psubd     %%mm5, %%mm7      \n\t" /* mm7 = a3-b3 a2-b2 + rounder */
        :: "r" (table), "r" (rounder)
    );
}

/* Plain-MMX pipeline drain: the pshufw word-swap of the MMXEXT version is
 * emulated with pslld/psrld/por. */
static inline void mmx_row_tail (int16_t * const row, const int store)
{
    __asm__ volatile (
        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */

        "packssdw  %%mm0, %%mm1      \n\t" /* mm1 = y3 y2 y1 y0 */

        "packssdw  %%mm3, %%mm7      \n\t" /* mm7 = y6 y7 y4 y5 */

        "movq      %%mm1, (%0)       \n\t" /* save y3 y2 y1 y0 */
        "movq      %%mm7, %%mm4      \n\t" /* mm4 = y6 y7 y4 y5 */

        "pslld     $16, %%mm7        \n\t" /* mm7 = y7 0 y5 0 */

        "psrld     $16, %%mm4        \n\t" /* mm4 = 0 y6 0 y4 */

        "por       %%mm4, %%mm7      \n\t" /* mm7 = y7 y6 y5 y4 */

        /* slot */

        "movq      %%mm7, 8(%0)      \n\t" /* save y7 y6 y5 y4 */
        :: "r" (row+store)
    );
}

/* Plain-MMX pipeline overlap stage: store previous row while starting the
 * next (counterpart of mmxext_row_mid). */
static inline void mmx_row_mid (int16_t * const row, const int store,
                                const int offset, const int16_t * const table)
{

    __asm__ volatile (
        "movq      (%0,%1), %%mm2    \n\t" /* mm2 = x6 x4 x2 x0 */
        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "movq      8(%0,%1), %%mm5   \n\t" /* mm5 = x7 x5 x3 x1 */
        "psrad     $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */

        "packssdw  %%mm0, %%mm1      \n\t" /* mm1 = y3 y2 y1 y0 */
        "movq      %%mm5, %%mm6      \n\t" /* mm6 = x7 x5 x3 x1 */

        "packssdw  %%mm3, %%mm7      \n\t" /* mm7 = y6 y7 y4 y5 */
        "movq      %%mm2, %%mm0      \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq      %%mm1, (%0,%2)    \n\t" /* save y3 y2 y1 y0 */
        "movq      %%mm7, %%mm1      \n\t" /* mm1 = y6 y7 y4 y5 */

        "punpckldq %%mm0, %%mm0      \n\t" /* mm0 = x2 x0 x2 x0 */
        "psrld     $16, %%mm7        \n\t" /* mm7 = 0 y6 0 y4 */

        "movq      (%3), %%mm3       \n\t" /* mm3 = C6 C4 C2 C4 */
        "pslld     $16, %%mm1        \n\t" /* mm1 = y7 0 y5 0 */

        "movq      8(%3), %%mm4      \n\t" /* mm4 = -C2 -C4 C6 C4 */
        "por       %%mm1, %%mm7      \n\t" /* mm7 = y7 y6 y5 y4 */

        "movq      16(%3), %%mm1     \n\t" /* mm1 = -C7 C3 C3 C1 */
        "punpckhdq %%mm2, %%mm2      \n\t" /* mm2 = x6 x4 x6 x4 */

        "movq      %%mm7, 8(%0,%2)   \n\t" /* save y7 y6 y5 y4 */
        "pmaddwd   %%mm0, %%mm3      \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
        : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
    );
}


#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_col (int16_t * col, int offset)
{
    /* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

    /* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif


/* MMX column IDCT */
/* Transforms four columns at once (one int16 lane per column) using
 * saturating arithmetic; `offset` selects columns 0-3 or 4-7.
 * NOTE(review): the T1/T2/T3/C4 constants appear to be 0.16 fixed-point
 * tangent/cosine factors (e.g. 13036/65536 ~= tan(pi/16)) consumed by
 * pmulhw -- derived from the values, not documented here; verify against
 * the reference C idct_col above. */
static inline void idct_col (int16_t * const col, const int offset)
{
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170

    /* four 4-lane constant vectors, indexed 0/8/16/24 from %0 below */
    DECLARE_ALIGNED(8, static const short, t1_vector)[] = {
        T1,T1,T1,T1,
        T2,T2,T2,T2,
        T3,T3,T3,T3,
        C4,C4,C4,C4
    };

    /* column code adapted from Peter Gubanov */
    /* http://www.elecard.com/peter/idct.shtml */

    __asm__ volatile (
        "movq      (%0), %%mm0       \n\t" /* mm0 = T1 */

        "movq      2*8(%1), %%mm1    \n\t" /* mm1 = x1 */
        "movq      %%mm0, %%mm2      \n\t" /* mm2 = T1 */

        "movq      7*2*8(%1), %%mm4  \n\t" /* mm4 = x7 */
        "pmulhw    %%mm1, %%mm0      \n\t" /* mm0 = T1*x1 */

        "movq      16(%0), %%mm5     \n\t" /* mm5 = T3 */
        "pmulhw    %%mm4, %%mm2      \n\t" /* mm2 = T1*x7 */

        "movq      2*5*8(%1), %%mm6  \n\t" /* mm6 = x5 */
        "movq      %%mm5, %%mm7      \n\t" /* mm7 = T3-1 */

        "movq      3*8*2(%1), %%mm3  \n\t" /* mm3 = x3 */
        "psubsw    %%mm4, %%mm0      \n\t" /* mm0 = v17 */

        "movq      8(%0), %%mm4      \n\t" /* mm4 = T2 */
        "pmulhw    %%mm3, %%mm5      \n\t" /* mm5 = (T3-1)*x3 */

        "paddsw    %%mm2, %%mm1      \n\t" /* mm1 = u17 */
        "pmulhw    %%mm6, %%mm7      \n\t" /* mm7 = (T3-1)*x5 */

        /* slot */

        "movq      %%mm4, %%mm2      \n\t" /* mm2 = T2 */
        "paddsw    %%mm3, %%mm5      \n\t" /* mm5 = T3*x3 */

        "pmulhw    2*8*2(%1), %%mm4  \n\t" /* mm4 = T2*x2 */
        "paddsw    %%mm6, %%mm7      \n\t" /* mm7 = T3*x5 */

        "psubsw    %%mm6, %%mm5      \n\t" /* mm5 = v35 */
        "paddsw    %%mm3, %%mm7      \n\t" /* mm7 = u35 */

        "movq      6*8*2(%1), %%mm3  \n\t" /* mm3 = x6 */
        "movq      %%mm0, %%mm6      \n\t" /* mm6 = v17 */

        "pmulhw    %%mm3, %%mm2      \n\t" /* mm2 = T2*x6 */
        "psubsw    %%mm5, %%mm0      \n\t" /* mm0 = b3 */

        "psubsw    %%mm3, %%mm4      \n\t" /* mm4 = v26 */
        "paddsw    %%mm6, %%mm5      \n\t" /* mm5 = v12 */

        "movq      %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */
        "movq      %%mm1, %%mm6      \n\t" /* mm6 = u17 */

        "paddsw    2*8*2(%1), %%mm2  \n\t" /* mm2 = u26 */
        "paddsw    %%mm7, %%mm6      \n\t" /* mm6 = b0 */

        "psubsw    %%mm7, %%mm1      \n\t" /* mm1 = u12 */
        "movq      %%mm1, %%mm7      \n\t" /* mm7 = u12 */

        "movq      0*8(%1), %%mm3    \n\t" /* mm3 = x0 */
        "paddsw    %%mm5, %%mm1      \n\t" /* mm1 = u12+v12 */

        "movq      24(%0), %%mm0     \n\t" /* mm0 = C4/2 */
        "psubsw    %%mm5, %%mm7      \n\t" /* mm7 = u12-v12 */

        "movq      %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */
        "pmulhw    %%mm0, %%mm1      \n\t" /* mm1 = b1/2 */

        "movq      %%mm4, %%mm6      \n\t" /* mm6 = v26 */
        "pmulhw    %%mm0, %%mm7      \n\t" /* mm7 = b2/2 */

        "movq      4*8*2(%1), %%mm5  \n\t" /* mm5 = x4 */
        "movq      %%mm3, %%mm0      \n\t" /* mm0 = x0 */

        "psubsw    %%mm5, %%mm3      \n\t" /* mm3 = v04 */
        "paddsw    %%mm5, %%mm0      \n\t" /* mm0 = u04 */

        "paddsw    %%mm3, %%mm4      \n\t" /* mm4 = a1 */
        "movq      %%mm0, %%mm5      \n\t" /* mm5 = u04 */

        "psubsw    %%mm6, %%mm3      \n\t" /* mm3 = a2 */
        "paddsw    %%mm2, %%mm5      \n\t" /* mm5 = a0 */

        "paddsw    %%mm1, %%mm1      \n\t" /* mm1 = b1 */
        "psubsw    %%mm2, %%mm0      \n\t" /* mm0 = a3 */

        "paddsw    %%mm7, %%mm7      \n\t" /* mm7 = b2 */
        "movq      %%mm3, %%mm2      \n\t" /* mm2 = a2 */

        "movq      %%mm4, %%mm6      \n\t" /* mm6 = a1 */
        "paddsw    %%mm7, %%mm3      \n\t" /* mm3 = a2+b2 */

        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */
        "paddsw    %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */

        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */
        "psubsw    %%mm1, %%mm6      \n\t" /* mm6 = a1-b1 */

        "movq      5*8*2(%1), %%mm1  \n\t" /* mm1 = b0 */
        "psubsw    %%mm7, %%mm2      \n\t" /* mm2 = a2-b2 */

        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */
        "movq      %%mm5, %%mm7      \n\t" /* mm7 = a0 */

        "movq      %%mm4, 1*8*2(%1)\n\t" /* save y1 */
        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */

        "movq      %%mm3, 2*8*2(%1)\n\t" /* save y2 */
        "paddsw    %%mm1, %%mm5      \n\t" /* mm5 = a0+b0 */

        "movq      3*8*2(%1), %%mm4  \n\t" /* mm4 = b3 */
        "psubsw    %%mm1, %%mm7      \n\t" /* mm7 = a0-b0 */

        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */
        "movq      %%mm0, %%mm3      \n\t" /* mm3 = a3 */

        "movq      %%mm2, 5*8*2(%1)\n\t" /* save y5 */
        "psubsw    %%mm4, %%mm3      \n\t" /* mm3 = a3-b3 */

        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */
        "paddsw    %%mm0, %%mm4      \n\t" /* mm4 = a3+b3 */

        "movq      %%mm5, 0*8*2(%1)\n\t" /* save y0 */
        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */

        "movq      %%mm6, 6*8*2(%1)\n\t" /* save y6 */
        "psraw     $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */

        "movq      %%mm7, 7*8*2(%1)\n\t" /* save y7 */

        "movq      %%mm3, 4*8*2(%1)\n\t" /* save y4 */

        "movq      %%mm4, 3*8*2(%1)\n\t" /* save y3 */
        :: "r" (t1_vector), "r" (col+offset)
    );

#undef T1
#undef T2
#undef T3
#undef C4
}


/* Per-row rounding biases.  Each rowN gets the bias matched to the
 * coefficient table used for that row (comments give the closed forms). */
DECLARE_ALIGNED(8, static const int32_t, rounder0)[] =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0);
DECLARE_ALIGNED(8, static const int32_t, rounder1)[] =
    rounder (1.25683487303);        /* C1*(C1/C4+C1+C7)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder7)[] =
    rounder (-0.25);                /* C1*(C7/C4+C7-C1)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder2)[] =
    rounder (0.60355339059);        /* C2 * (C6+C2)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder6)[] =
    rounder (-0.25);                /* C2 * (C6-C2)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder3)[] =
    rounder (0.087788325588);       /* C3*(-C3/C4+C3+C5)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder5)[] =
    rounder (-0.441341716183);      /* C3*(-C5/C4+C5-C3)/2 */

#undef COL_SHIFT
#undef ROW_SHIFT

/* Instantiate a complete 8x8 IDCT from the four row-pass primitives.
 * Rows are processed in the software-pipelined order 0,4,1,7,2,6,3,5:
 * each idct_row_mid stores the previous row's result while loading the
 * next row, so only one head and one tail are needed.  Each table serves
 * the two rows that share scale factors (0&4, 1&7, 2&6, 3&5); the two
 * idct_col calls then transform columns 0-3 and 4-7. */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * const block)                                       \
{                                                                       \
    DECLARE_ALIGNED(16, static const int16_t, table04)[] =              \
        table (22725, 21407, 19266, 16384, 12873, 8867, 4520);          \
    DECLARE_ALIGNED(16, static const int16_t, table17)[] =              \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    DECLARE_ALIGNED(16, static const int16_t, table26)[] =              \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    DECLARE_ALIGNED(16, static const int16_t, table35)[] =              \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}

declare_idct (ff_mmxext_idct, mmxext_table,
              mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)

declare_idct (ff_mmx_idct, mmx_table,
              mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)