Libav
|
00001 /* 00002 * SPARC VIS optimized inverse DCT 00003 * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu > 00004 * 00005 * I did consult the following fine web page about dct 00006 * http://www.geocities.com/ssavekar/dct.htm 00007 * 00008 * This file is part of FFmpeg. 00009 * 00010 * FFmpeg is free software; you can redistribute it and/or 00011 * modify it under the terms of the GNU Lesser General Public 00012 * License as published by the Free Software Foundation; either 00013 * version 2.1 of the License, or (at your option) any later version. 00014 * 00015 * FFmpeg is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 * Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with FFmpeg; if not, write to the Free Software 00022 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00023 */ 00024 00025 #include "libavcodec/dsputil.h" 00026 #include "dsputil_vis.h" 00027 00028 static const DECLARE_ALIGNED(8, int16_t, coeffs)[28] = { 00029 - 1259,- 1259,- 1259,- 1259, 00030 - 4989,- 4989,- 4989,- 4989, 00031 -11045,-11045,-11045,-11045, 00032 -19195,-19195,-19195,-19195, 00033 -29126,-29126,-29126,-29126, 00034 25080, 25080, 25080, 25080, 00035 12785, 12785, 12785, 12785 00036 }; 00037 static const DECLARE_ALIGNED(8, uint16_t, scale)[4] = { 00038 65536>>6, 65536>>6, 65536>>6, 65536>>6 00039 }; 00040 static const DECLARE_ALIGNED(8, uint16_t, rounder)[4] = { 00041 1<<5, 1<<5, 1<<5, 1<<5 00042 }; 00043 static const DECLARE_ALIGNED(8, uint16_t, expand)[4] = { 00044 1<<14, 1<<14, 1<<14, 1<<14 00045 }; 00046 00047 #define INIT_IDCT \ 00048 "ldd [%1], %%f32 \n\t"\ 00049 "ldd [%1+8], %%f34 \n\t"\ 00050 "ldd [%1+16], %%f36 \n\t"\ 00051 "ldd [%1+24], %%f38 \n\t"\ 00052 "ldd [%1+32], %%f40 \n\t"\ 00053 "ldd [%1+40], %%f42 \n\t"\ 00054 "ldd [%1+48], %%f44 \n\t"\ 00055 "ldd [%0], %%f46 \n\t"\ 00056 "fzero %%f62 \n\t"\ 00057 00058 #define LOADSCALE(in) \ 00059 "ldd [" in "], %%f0 \n\t"\ 00060 "ldd [" in "+16], %%f2 \n\t"\ 00061 "ldd [" in "+32], %%f4 \n\t"\ 00062 "ldd [" in "+48], %%f6 \n\t"\ 00063 "ldd [" in "+64], %%f8 \n\t"\ 00064 "ldd [" in "+80], %%f10 \n\t"\ 00065 "ldd [" in "+96], %%f12 \n\t"\ 00066 "ldd [" in "+112], %%f14 \n\t"\ 00067 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00068 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00069 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00070 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00071 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00072 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00073 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00074 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00075 \ 00076 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00077 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00078 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00079 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00080 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00081 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00082 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00083 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00084 \ 00085 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00086 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00087 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00088 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00089 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00090 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00091 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00092 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00093 \ 00094 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00095 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00096 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00097 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00098 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00099 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00100 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00101 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00102 00103 #define LOAD(in) \ 00104 "ldd [" in "], %%f16 \n\t"\ 00105 "ldd [" in "+8], %%f18 \n\t"\ 00106 "ldd [" in "+16], %%f20 \n\t"\ 00107 "ldd [" in "+24], %%f22 \n\t"\ 00108 "ldd [" in "+32], %%f24 \n\t"\ 00109 "ldd [" in "+40], %%f26 \n\t"\ 00110 "ldd [" in "+48], %%f28 \n\t"\ 00111 "ldd [" in "+56], %%f30 \n\t"\ 00112 00113 #define TRANSPOSE \ 00114 "fpmerge %%f16, %%f24, %%f0 \n\t"\ 00115 "fpmerge %%f20, %%f28, %%f2 \n\t"\ 00116 "fpmerge %%f17, %%f25, %%f4 \n\t"\ 00117 "fpmerge %%f21, %%f29, %%f6 \n\t"\ 00118 "fpmerge %%f18, %%f26, %%f8 \n\t"\ 00119 "fpmerge %%f22, %%f30, %%f10 \n\t"\ 00120 "fpmerge %%f19, %%f27, %%f12 \n\t"\ 00121 "fpmerge %%f23, %%f31, %%f14 \n\t"\ 00122 \ 00123 "fpmerge %%f0, %%f2, %%f16 \n\t"\ 00124 "fpmerge %%f1, %%f3, %%f18 \n\t"\ 00125 "fpmerge %%f4, %%f6, %%f20 \n\t"\ 00126 "fpmerge %%f5, %%f7, %%f22 \n\t"\ 00127 "fpmerge %%f8, %%f10, %%f24 \n\t"\ 00128 "fpmerge %%f9, %%f11, %%f26 \n\t"\ 00129 "fpmerge %%f12, %%f14, %%f28 \n\t"\ 00130 "fpmerge %%f13, %%f15, %%f30 \n\t"\ 00131 \ 00132 "fpmerge %%f16, %%f17, %%f0 \n\t"\ 00133 "fpmerge %%f18, %%f19, %%f2 \n\t"\ 00134 "fpmerge %%f20, %%f21, %%f4 \n\t"\ 00135 "fpmerge %%f22, %%f23, %%f6 \n\t"\ 00136 "fpmerge %%f24, %%f25, %%f8 \n\t"\ 00137 "fpmerge %%f26, %%f27, %%f10 \n\t"\ 00138 "fpmerge %%f28, %%f29, %%f12 \n\t"\ 00139 "fpmerge %%f30, %%f31, %%f14 \n\t"\ 00140 00141 #define IDCT4ROWS \ 00142 /* 1. column */\ 00143 "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\ 00144 "for %%f4, %%f6, %%f60 \n\t"\ 00145 "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\ 00146 "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\ 00147 "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\ 00148 "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\ 00149 \ 00150 ADDROUNDER\ 00151 \ 00152 "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\ 00153 "fcmpd %%fcc0, %%f62, %%f60 \n\t"\ 00154 "for %%f8, %%f10, %%f60 \n\t"\ 00155 "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\ 00156 "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\ 00157 "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\ 00158 "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\ 00159 \ 00160 "fpadd16 %%f48, %%f28, %%f28 \n\t"\ 00161 "fcmpd %%fcc1, %%f62, %%f60 \n\t"\ 00162 "for %%f12, %%f14, %%f60 \n\t"\ 00163 "fpadd16 %%f50, %%f18, %%f18 \n\t"\ 00164 "fpadd16 %%f52, %%f22, %%f22 \n\t"\ 00165 "fpadd16 %%f54, %%f26, %%f26 \n\t"\ 00166 "fpadd16 %%f56, %%f30, %%f30 \n\t"\ 00167 \ 00168 "fpadd16 %%f28, %%f0, %%f16 \n\t"\ 00169 "fcmpd %%fcc2, %%f62, %%f60 \n\t"\ 00170 "fpadd16 %%f28, %%f0, %%f20 \n\t"\ 00171 "fpadd16 %%f28, %%f0, %%f24 \n\t"\ 00172 "fpadd16 %%f28, %%f0, %%f28 \n\t"\ 00173 "fpadd16 %%f18, %%f2, %%f18 \n\t"\ 00174 "fpadd16 %%f22, %%f2, %%f22 \n\t"\ 00175 /* 2. column */\ 00176 "fbe %%fcc0, 3f \n\t"\ 00177 "fpadd16 %%f26, %%f2, %%f26 \n\t"\ 00178 "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\ 00179 "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\ 00180 "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\ 00181 "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\ 00182 "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\ 00183 "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\ 00184 \ 00185 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00186 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ 00187 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ 00188 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00189 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00190 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00191 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ 00192 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00193 \ 00194 "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\ 00195 "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\ 00196 "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\ 00197 "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\ 00198 "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\ 00199 "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\ 00200 \ 00201 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00202 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ 00203 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ 00204 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00205 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00206 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00207 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ 00208 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00209 \ 00210 "fpadd16 %%f16, %%f4, %%f16 \n\t"\ 00211 "fpsub16 %%f28, %%f4, %%f28 \n\t"\ 00212 "fpadd16 %%f18, %%f6, %%f18 \n\t"\ 00213 "fpsub16 %%f26, %%f6, %%f26 \n\t"\ 00214 /* 3. column */\ 00215 "3: \n\t"\ 00216 "fbe %%fcc1, 4f \n\t"\ 00217 "fpsub16 %%f30, %%f6, %%f30 \n\t"\ 00218 "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\ 00219 "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\ 00220 "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\ 00221 "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\ 00222 "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\ 00223 \ 00224 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00225 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ 00226 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ 00227 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ 00228 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ 00229 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ 00230 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ 00231 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ 00232 \ 00233 "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\ 00234 "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\ 00235 "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\ 00236 "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\ 00237 "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\ 00238 \ 00239 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00240 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ 00241 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ 00242 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ 00243 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ 00244 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ 00245 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ 00246 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ 00247 \ 00248 "fpadd16 %%f16, %%f8, %%f16 \n\t"\ 00249 "fpsub16 %%f20, %%f8, %%f20 \n\t"\ 00250 "fpsub16 %%f24, %%f8, %%f24 \n\t"\ 00251 "fpadd16 %%f28, %%f8, %%f28 \n\t"\ 00252 "fpadd16 %%f18, %%f10, %%f18 \n\t"\ 00253 "fpsub16 %%f22, %%f10, %%f22 \n\t"\ 00254 /* 4. column */\ 00255 "4: \n\t"\ 00256 "fbe %%fcc2, 5f \n\t"\ 00257 "fpadd16 %%f30, %%f10, %%f30 \n\t"\ 00258 "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\ 00259 "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\ 00260 "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\ 00261 "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\ 00262 "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\ 00263 "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\ 00264 \ 00265 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00266 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ 00267 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ 00268 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00269 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00270 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00271 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ 00272 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00273 \ 00274 "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\ 00275 "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\ 00276 "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\ 00277 "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\ 00278 "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\ 00279 "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\ 00280 \ 00281 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00282 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ 00283 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ 00284 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00285 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00286 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00287 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ 00288 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00289 \ 00290 "fpsub16 %%f20, %%f12, %%f20 \n\t"\ 00291 "fpadd16 %%f24, %%f12, %%f24 \n\t"\ 00292 "fpsub16 %%f22, %%f14, %%f22 \n\t"\ 00293 "fpadd16 %%f26, %%f14, %%f26 \n\t"\ 00294 "fpsub16 %%f30, %%f14, %%f30 \n\t"\ 00295 /* final butterfly */\ 00296 "5: \n\t"\ 00297 "fpsub16 %%f16, %%f18, %%f48 \n\t"\ 00298 "fpsub16 %%f20, %%f22, %%f50 \n\t"\ 00299 "fpsub16 %%f24, %%f26, %%f52 \n\t"\ 00300 "fpsub16 %%f28, %%f30, %%f54 \n\t"\ 00301 "fpadd16 %%f16, %%f18, %%f16 \n\t"\ 00302 "fpadd16 %%f20, %%f22, %%f20 \n\t"\ 00303 "fpadd16 %%f24, %%f26, %%f24 \n\t"\ 00304 "fpadd16 %%f28, %%f30, %%f28 \n\t"\ 00305 00306 #define STOREROWS(out) \ 00307 "std %%f48, [" out "+112] \n\t"\ 00308 "std %%f50, [" out "+96] \n\t"\ 00309 "std %%f52, [" out "+80] \n\t"\ 00310 "std %%f54, [" out "+64] \n\t"\ 00311 "std %%f16, [" out "] \n\t"\ 00312 "std %%f20, [" out "+16] \n\t"\ 00313 "std %%f24, [" out "+32] \n\t"\ 00314 "std %%f28, [" out "+48] \n\t"\ 00315 00316 #define SCALEROWS \ 00317 "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\ 00318 "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\ 00319 "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\ 00320 "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\ 00321 "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\ 00322 "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\ 00323 "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\ 00324 "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\ 00325 00326 #define PUTPIXELSCLAMPED(dest) \ 00327 "fpack16 %%f48, %%f14 \n\t"\ 00328 "fpack16 %%f50, %%f12 \n\t"\ 00329 "fpack16 %%f16, %%f0 \n\t"\ 00330 "fpack16 %%f20, %%f2 \n\t"\ 00331 "fpack16 %%f24, %%f4 \n\t"\ 00332 "fpack16 %%f28, %%f6 \n\t"\ 00333 "fpack16 %%f54, %%f8 \n\t"\ 00334 "fpack16 %%f52, %%f10 \n\t"\ 00335 "st %%f0, [%3+" dest "] \n\t"\ 00336 "st %%f2, [%5+" dest "] \n\t"\ 00337 "st %%f4, [%6+" dest "] \n\t"\ 00338 "st %%f6, [%7+" dest "] \n\t"\ 00339 "st %%f8, [%8+" dest "] \n\t"\ 00340 "st %%f10, [%9+" dest "] \n\t"\ 00341 "st %%f12, [%10+" dest "] \n\t"\ 00342 "st %%f14, [%11+" dest "] \n\t"\ 00343 00344 #define ADDPIXELSCLAMPED(dest) \ 00345 "ldd [%5], %%f18 \n\t"\ 00346 "ld [%3+" dest"], %%f0 \n\t"\ 00347 "ld [%6+" dest"], %%f2 \n\t"\ 00348 "ld [%7+" dest"], %%f4 \n\t"\ 00349 "ld [%8+" dest"], %%f6 \n\t"\ 00350 "ld [%9+" dest"], %%f8 \n\t"\ 00351 "ld [%10+" dest"], %%f10 \n\t"\ 00352 "ld [%11+" dest"], %%f12 \n\t"\ 00353 "ld [%12+" dest"], %%f14 \n\t"\ 00354 "fmul8x16 %%f0, %%f18, %%f0 \n\t"\ 00355 "fmul8x16 %%f2, %%f18, %%f2 \n\t"\ 00356 "fmul8x16 %%f4, %%f18, %%f4 \n\t"\ 00357 "fmul8x16 %%f6, %%f18, %%f6 \n\t"\ 00358 "fmul8x16 %%f8, %%f18, %%f8 \n\t"\ 00359 "fmul8x16 %%f10, %%f18, %%f10 \n\t"\ 00360 "fmul8x16 %%f12, %%f18, %%f12 \n\t"\ 00361 "fmul8x16 %%f14, %%f18, %%f14 \n\t"\ 00362 "fpadd16 %%f0, %%f16, %%f0 \n\t"\ 00363 "fpadd16 %%f2, %%f20, %%f2 \n\t"\ 00364 "fpadd16 %%f4, %%f24, %%f4 \n\t"\ 00365 "fpadd16 %%f6, %%f28, %%f6 \n\t"\ 00366 "fpadd16 %%f8, %%f54, %%f8 \n\t"\ 00367 "fpadd16 %%f10, %%f52, %%f10 \n\t"\ 00368 "fpadd16 %%f12, %%f50, %%f12 \n\t"\ 00369 "fpadd16 %%f14, %%f48, %%f14 \n\t"\ 00370 "fpack16 %%f0, %%f0 \n\t"\ 00371 "fpack16 %%f2, %%f2 \n\t"\ 00372 "fpack16 %%f4, %%f4 \n\t"\ 00373 "fpack16 %%f6, %%f6 \n\t"\ 00374 "fpack16 %%f8, %%f8 \n\t"\ 00375 "fpack16 %%f10, %%f10 \n\t"\ 00376 "fpack16 %%f12, %%f12 \n\t"\ 00377 "fpack16 %%f14, %%f14 \n\t"\ 00378 "st %%f0, [%3+" dest "] \n\t"\ 00379 "st %%f2, [%6+" dest "] \n\t"\ 00380 "st %%f4, [%7+" dest "] \n\t"\ 00381 "st %%f6, [%8+" dest "] \n\t"\ 00382 "st %%f8, [%9+" dest "] \n\t"\ 00383 "st %%f10, [%10+" dest "] \n\t"\ 00384 "st %%f12, [%11+" dest "] \n\t"\ 00385 "st %%f14, [%12+" dest "] \n\t"\ 00386 00387 00388 void ff_simple_idct_vis(DCTELEM *data) { 00389 int out1, out2, out3, out4; 00390 DECLARE_ALIGNED(8, int16_t, temp)[8*8]; 00391 00392 __asm__ volatile( 00393 INIT_IDCT 00394 00395 #define ADDROUNDER 00396 00397 // shift right 16-4=12 00398 LOADSCALE("%2+8") 00399 IDCT4ROWS 00400 STOREROWS("%3+8") 00401 LOADSCALE("%2+0") 00402 IDCT4ROWS 00403 "std %%f48, [%3+112] \n\t" 00404 "std %%f50, [%3+96] \n\t" 00405 "std %%f52, [%3+80] \n\t" 00406 "std %%f54, [%3+64] \n\t" 00407 00408 // shift right 16+4 00409 "ldd [%3+8], %%f18 \n\t" 00410 "ldd [%3+24], %%f22 \n\t" 00411 "ldd [%3+40], %%f26 \n\t" 00412 "ldd [%3+56], %%f30 \n\t" 00413 TRANSPOSE 00414 IDCT4ROWS 00415 SCALEROWS 00416 STOREROWS("%2+0") 00417 LOAD("%3+64") 00418 TRANSPOSE 00419 IDCT4ROWS 00420 SCALEROWS 00421 STOREROWS("%2+8") 00422 00423 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4) 00424 : "0" (scale), "1" (coeffs), "2" (data), "3" (temp) 00425 ); 00426 } 00427 00428 void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) { 00429 int out1, out2, out3, out4, out5; 00430 int r1, r2, r3, r4, r5, r6, r7; 00431 00432 __asm__ volatile( 00433 "wr %%g0, 0x8, %%gsr \n\t" 00434 00435 INIT_IDCT 00436 00437 "add %3, %4, %5 \n\t" 00438 "add %5, %4, %6 \n\t" 00439 "add %6, %4, %7 \n\t" 00440 "add %7, %4, %8 \n\t" 00441 "add %8, %4, %9 \n\t" 00442 "add %9, %4, %10 \n\t" 00443 "add %10, %4, %11 \n\t" 00444 00445 // shift right 16-4=12 00446 LOADSCALE("%2+8") 00447 IDCT4ROWS 00448 STOREROWS("%2+8") 00449 LOADSCALE("%2+0") 00450 IDCT4ROWS 00451 "std %%f48, [%2+112] \n\t" 00452 "std %%f50, [%2+96] \n\t" 00453 "std %%f52, [%2+80] \n\t" 00454 "std %%f54, [%2+64] \n\t" 00455 00456 #undef ADDROUNDER 00457 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" 00458 00459 // shift right 16+4 00460 "ldd [%2+8], %%f18 \n\t" 00461 "ldd [%2+24], %%f22 \n\t" 00462 "ldd [%2+40], %%f26 \n\t" 00463 "ldd [%2+56], %%f30 \n\t" 00464 TRANSPOSE 00465 IDCT4ROWS 00466 PUTPIXELSCLAMPED("0") 00467 LOAD("%2+64") 00468 TRANSPOSE 00469 IDCT4ROWS 00470 PUTPIXELSCLAMPED("4") 00471 00472 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), 00473 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) 00474 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size) 00475 ); 00476 } 00477 00478 void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) { 00479 int out1, out2, out3, out4, out5, out6; 00480 int r1, r2, r3, r4, r5, r6, r7; 00481 00482 __asm__ volatile( 00483 "wr %%g0, 0x8, %%gsr \n\t" 00484 00485 INIT_IDCT 00486 00487 "add %3, %4, %6 \n\t" 00488 "add %6, %4, %7 \n\t" 00489 "add %7, %4, %8 \n\t" 00490 "add %8, %4, %9 \n\t" 00491 "add %9, %4, %10 \n\t" 00492 "add %10, %4, %11 \n\t" 00493 "add %11, %4, %12 \n\t" 00494 00495 #undef ADDROUNDER 00496 #define ADDROUNDER 00497 00498 // shift right 16-4=12 00499 LOADSCALE("%2+8") 00500 IDCT4ROWS 00501 STOREROWS("%2+8") 00502 LOADSCALE("%2+0") 00503 IDCT4ROWS 00504 "std %%f48, [%2+112] \n\t" 00505 "std %%f50, [%2+96] \n\t" 00506 "std %%f52, [%2+80] \n\t" 00507 "std %%f54, [%2+64] \n\t" 00508 00509 #undef ADDROUNDER 00510 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" 00511 00512 // shift right 16+4 00513 "ldd [%2+8], %%f18 \n\t" 00514 "ldd [%2+24], %%f22 \n\t" 00515 "ldd [%2+40], %%f26 \n\t" 00516 "ldd [%2+56], %%f30 \n\t" 00517 TRANSPOSE 00518 IDCT4ROWS 00519 ADDPIXELSCLAMPED("0") 00520 LOAD("%2+64") 00521 TRANSPOSE 00522 IDCT4ROWS 00523 ADDPIXELSCLAMPED("4") 00524 00525 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6), 00526 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) 00527 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand) 00528 ); 00529 }