• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/ps2/idct_mmi.c

Go to the documentation of this file.
00001 /*
00002  * Originally provided by Intel at Application Note AP-922.
00003  *
00004  * Column code adapted from Peter Gubanov.
00005  * Copyright (c) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
00006  * http://www.elecard.com/peter/idct.shtml
00007  * rounding trick copyright (c) 2000 Michel Lespinasse <walken@zoy.org>
00008  *
00009  * MMI port and (c) 2002 by Leon van Stuivenberg
00010  *
00011  * This file is part of FFmpeg.
00012  *
00013  * FFmpeg is free software; you can redistribute it and/or
00014  * modify it under the terms of the GNU Lesser General Public
00015  * License as published by the Free Software Foundation; either
00016  * version 2.1 of the License, or (at your option) any later version.
00017  *
00018  * FFmpeg is distributed in the hope that it will be useful,
00019  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00020  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00021  * Lesser General Public License for more details.
00022  *
00023  * You should have received a copy of the GNU Lesser General Public
00024  * License along with FFmpeg; if not, write to the Free Software
00025  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00026  */
00027 
00028 #include "libavutil/common.h"
00029 #include "libavcodec/dsputil.h"
00030 #include "mmi.h"
00031 
00032 #define BITS_INV_ACC    5       // 4 or 5 for IEEE (presumably IEEE 1180 IDCT accuracy -- confirm)
00033 #define SHIFT_INV_ROW   (16 - BITS_INV_ACC)  /* right shift applied to row-pass results (11 when BITS_INV_ACC is 5) */
00034 #define SHIFT_INV_COL   (1 + BITS_INV_ACC)   /* right shift applied to column-pass results (6 when BITS_INV_ACC is 5) */
00035 /* Column-pass multipliers; the column code shifts every product right by 15, i.e. they are scaled by 2^15. */
00036 #define TG1             6518    /* numerically round(tan(1*pi/16) * 2^15) */
00037 #define TG2             13573   /* numerically round(tan(2*pi/16) * 2^15) */
00038 #define TG3             21895   /* numerically round(tan(3*pi/16) * 2^15) */
00039 #define CS4             23170   /* numerically cos(pi/4) * 2^15, truncated */
00040 /* Everything below is a BYTE offset into consttable (base address is kept in $24). */
00041 #define ROUNDER_0       0       /* first 16-byte rounder vector  */
00042 #define ROUNDER_1       16      /* second 16-byte rounder vector */
00043 /* Row coefficient tables: 32 shorts (64 bytes) each, following the two rounder vectors. */
00044 #define TAB_i_04        (32+0)
00045 #define TAB_i_17        (32+64)
00046 #define TAB_i_26        (32+128)
00047 #define TAB_i_35        (32+192)
00048 /* Column constants: one 16-byte vector (eight copies of the value) each, after the 256 bytes of row tables. */
00049 #define TG_1_16         (32+256+0)
00050 #define TG_2_16         (32+256+16)
00051 #define TG_3_16         (32+256+32)
00052 #define COS_4_16        (32+256+48)
00053 /* Vector of eight 255s used as the upper clamp by PUT()/ADD(). */
00054 #define CLIPMAX         (32+256+64+0)
00055 
/*
 * Constant pool for the IDCT.  The code loads 16-byte vectors from it with
 * lq() using $24 as base and the byte offsets ROUNDER_x, TAB_i_xx, TG_x_16,
 * COS_4_16 and CLIPMAX.  Layout (byte offset: content):
 *     0: rounder 0              16: rounder 1
 *    32: row table 0/4          96: row table 1/7
 *   160: row table 2/6         224: row table 3/5
 *   288: TG1 x8   304: TG2 x8   320: TG3 x8   336: CS4 x8
 *   352: 255 x8 (clamp)
 * align16 comes from a project header not visible here; presumably it
 * requests 16-byte alignment, which the quadword loads would need -- confirm.
 */
00056 static short consttable[] align16 = {
00057 /* rounder 0 (ROUNDER_0): used for the row at block offset 0 only */  // assume SHIFT_INV_ROW == 11
00058  0x3ff, 1, 0x3ff, 1, 0x3ff, 1, 0x3ff, 1,
00059 /* rounder 1 (ROUNDER_1): used for the seven other rows */
00060  0x3ff, 0, 0x3ff, 0, 0x3ff, 0, 0x3ff, 0,
00061 /* row 0/4 (TAB_i_04) */
00062  16384,  21407, -16384, -21407,  22725,  19266, -22725, -12873,
00063   8867,  16384,   8867,  16384,   4520,  12873,  -4520,  19266,
00064  16384,  -8867,  16384,  -8867,  12873, -22725,  19266, -22725,
00065  21407, -16384, -21407,  16384,  19266,   4520, -12873,   4520,
00066 /* row 1/7 (TAB_i_17) */
00067  22725,  29692, -22725, -29692,  31521,  26722, -31521, -17855,
00068  12299,  22725,  12299,  22725,   6270,  17855,  -6270,  26722,
00069  22725, -12299,  22725, -12299,  17855, -31521,  26722, -31521,
00070  29692, -22725, -29692,  22725,  26722,   6270, -17855,   6270,
00071 /* row 2/6 (TAB_i_26) */
00072  21407,  27969, -21407, -27969,  29692,  25172, -29692, -16819,
00073  11585,  21407,  11585,  21407,   5906,  16819,  -5906,  25172,
00074  21407, -11585,  21407, -11585,  16819, -29692,  25172, -29692,
00075  27969, -21407, -27969,  21407,  25172,   5906, -16819,   5906,
00076 /* row 3/5 (TAB_i_35) */
00077  19266,  25172, -19266, -25172,  26722,  22654, -26722, -15137,
00078  10426,  19266,  10426,  19266,   5315,  15137,  -5315,  22654,
00079  19266, -10426,  19266, -10426,  15137, -26722,  22654, -26722,
00080  25172, -19266, -25172,  19266,  22654,   5315, -15137,   5315,
00081 /* column constants (TG_1_16, TG_2_16, TG_3_16, COS_4_16), one vector each */
00082  TG1, TG1, TG1, TG1, TG1, TG1, TG1, TG1,
00083  TG2, TG2, TG2, TG2, TG2, TG2, TG2, TG2,
00084  TG3, TG3, TG3, TG3, TG3, TG3, TG3, TG3,
00085  CS4, CS4, CS4, CS4, CS4, CS4, CS4, CS4,
00086 /* clamp (CLIPMAX): upper bound applied by PUT()/ADD() before packing to bytes */
00087  255, 255, 255, 255, 255, 255, 255, 255
00088 };
00089 
00090 
/*
 * Row pass: inverse-transform one row of eight coefficients.
 *
 *   blk     register holding the block address
 *   rowoff  byte offset of the row inside the block
 *   taboff  byte offset of the 64-byte coefficient table inside consttable
 *   rnd     register holding the rounding vector added before the shift
 *   outreg  register receiving the eight 16-bit results
 *
 * Requires $24 to hold the address of consttable.  Uses $2 and $16..$20 as
 * scratch.  Per the annotation on the first load, the row is expected in the
 * permuted order x7 x5 x3 x1 x6 x4 x2 x0 (high..low lane).  The instruction
 * macros (lq, prevh, phmadh, ...) come from mmi.h, which is not visible here;
 * the lane descriptions below follow the original author's annotations.
 */
00091 #define DCT_8_INV_ROW1(blk, rowoff, taboff, rnd, outreg) { \
00092         lq(blk, rowoff, $16);   /* r16 = x7  x5  x3  x1  x6  x4  x2  x0 */ \
00093         /* slot (scheduling note from the original author) */ \
00094         lq($24, 0+taboff, $17); /* r17 = table bytes  0..15 */ \
00095         /* delay slot $16 (scheduling note: $16 is still being loaded) */ \
00096         lq($24, 16+taboff, $18);/* r18 = table bytes 16..31 */ \
00097         prevh($16, $2);         /* r2  = x1  x3  x5  x7  x0  x2  x4  x6 */ \
00098         lq($24, 32+taboff, $19);/* r19 = table bytes 32..47 */ \
00099         phmadh($17, $16, $17);  /* r17 = b1"b0'a1"a0' */ \
00100         lq($24, 48+taboff, $20);/* r20 = table bytes 48..63 */ \
00101         phmadh($18, $2, $18);   /* r18 = b1'b0"a1'a0" */ \
00102         phmadh($19, $16, $19);  /* r19 = b3"b2'a3"a2' */ \
00103         phmadh($20, $2, $20);   /* r20 = b3'b2"a3'a2" */ \
00104         paddw($17, $18, $17);   /* r17 = (b1)(b0)(a1)(a0) */ \
00105         paddw($19, $20, $19);   /* r19 = (b3)(b2)(a3)(a2) */ \
00106         pcpyld($19, $17, $18);  /* r18 = (a3)(a2)(a1)(a0) */ \
00107         pcpyud($17, $19, $20);  /* r20 = (b3)(b2)(b1)(b0) */ \
00108         paddw($18, rnd, $18);   /* r18 = (a3)(a2)(a1)(a0), each plus the rounder */\
00109         paddw($18, $20, $17);   /* r17 = (a3+b3)(a2+b2)(a1+b1)(a0+b0) */ \
00110         psubw($18, $20, $20);   /* r20 = (a3-b3)(a2-b2)(a1-b1)(a0-b0) */ \
00111         psraw($17, SHIFT_INV_ROW, $17); /* r17 = (y3 y2 y1 y0) */ \
00112         psraw($20, SHIFT_INV_ROW, $20); /* r20 = (y4 y5 y6 y7) */ \
00113         ppach($20, $17, outreg);/* out = y4 y5 y6 y7 y3 y2 y1 y0  Note order */ \
00114 /* the next three instructions put the upper four results into natural order */ \
00115         prevh(outreg, $2);      /* r2  = y7 y6 y5 y4 y0 y1 y2 y3 */ \
00116         pcpyud($2, $2, $2);     /* r2  = y7 y6 y5 y4 y7 y6 y5 y4 */ \
00117         pcpyld($2, outreg, outreg); /* out = y7 y6 y5 y4 y3 y2 y1 y0 */ \
00118 }
00119 
00120 
/*
 * Column pass, first half: processes all eight columns at once, one 16-bit
 * lane per column.
 *
 * Inputs : $8..$15 = rows 0..7 as produced by DCT_8_INV_ROW1
 *          (the annotations below call them x0..x7), $24 = consttable.
 * Outputs: $16 = a0, $17 = a1, $18 = a2, $19 = a3,
 *          $20 = b0, $21 = b1, $22 = b2, $23 = b3.
 * Scratch: $2, $3.
 *
 * Every multiplication by a constant uses the same five-instruction pattern:
 * pmulth leaves the products of lanes 6,4,2,0 in its destination, pmfhl_uw
 * fetches the products of lanes 7,5,3,1, both are shifted right by 15 and
 * pinteh interleaves them back into eight 16-bit lanes.  (Description taken
 * from the per-line annotations; the instruction macros live in mmi.h.)
 * The final y = a +/- b butterflies and the SHIFT_INV_COL shift are done by
 * DCT_8_INV_COL8_STORE or DCT_8_INV_COL8_PMS.
 */
00121 #define DCT_8_INV_COL8() \
00122 /* odd-numbered rows 3 and 5 ($11, $13) */ \
00123         lq($24, TG_3_16, $2);   /* r2  = tn3 */         \
00124 \
00125         pmulth($11, $2, $17);   /* r17 = x3 * tn3 (lanes 6,4,2,0) */ \
00126         psraw($17, 15, $17);    /* undo the 2^15 scale of the constant */ \
00127         pmfhl_uw($3);           /* r3  = products of lanes 7,5,3,1 */ \
00128         psraw($3, 15, $3);      /* same scaling for the odd lanes */ \
00129         pinteh($3, $17, $17);   /* r17 = x3 * tn3 */    \
00130         psubh($17, $13, $17);   /* r17 = tm35 */        \
00131 \
00132         pmulth($13, $2, $18);   /* r18 = x5 * tn3 (6420) */ \
00133         psraw($18, 15, $18);    \
00134         pmfhl_uw($3);           /* r3  = 7531 */        \
00135         psraw($3, 15, $3);      \
00136         pinteh($3, $18, $18);   /* r18 = x5 * tn3 */    \
00137         paddh($18, $11, $18);   /* r18 = tp35 */        \
00138 /* odd-numbered rows 1 and 7 ($9, $15) */ \
00139         lq($24, TG_1_16, $2);   /* r2  = tn1 */         \
00140 \
00141         pmulth($15, $2, $19);   /* r19 = x7 * tn1 (6420) */ \
00142         psraw($19, 15, $19);    \
00143         pmfhl_uw($3);           /* r3  = 7531 */        \
00144         psraw($3, 15, $3);      \
00145         pinteh($3, $19, $19);   /* r19 = x7 * tn1 */    \
00146         paddh($19, $9, $19);    /* r19 = tp17 */        \
00147 \
00148         pmulth($9, $2, $20);    /* r20 = x1 * tn1 (6420) */ \
00149         psraw($20, 15, $20);    \
00150         pmfhl_uw($3);           /* r3  = 7531 */        \
00151         psraw($3, 15, $3);      \
00152         pinteh($3, $20, $20);   /* r20 = x1 * tn1 */    \
00153         psubh($20, $15, $20);   /* r20 = tm17 */        \
00154 /* combine the four odd-row terms */ \
00155         psubh($19, $18, $3);    /* r3  = t1 */          \
00156         paddh($20, $17, $16);   /* r16 = t2 */          \
00157         psubh($20, $17, $23);   /* r23 = b3 */          \
00158         paddh($19, $18, $20);   /* r20 = b0 */          \
00159 \
00160         lq($24, COS_4_16, $2);  /* r2  = cs4 */         \
00161 \
00162         paddh($3, $16, $21);    /* r21 = t1+t2 */       \
00163         psubh($3, $16, $22);    /* r22 = t1-t2 */       \
00164 \
00165         pmulth($21, $2, $21);   /* r21 = cs4 * (t1+t2) 6420 */ \
00166         psraw($21, 15, $21);    \
00167         pmfhl_uw($3);           /* r3  = 7531 */        \
00168         psraw($3, 15, $3);      \
00169         pinteh($3, $21, $21);   /* r21 = b1 */          \
00170 \
00171         pmulth($22, $2, $22);   /* r22 = cs4 * (t1-t2) 6420 */ \
00172         psraw($22, 15, $22);    \
00173         pmfhl_uw($3);           /* r3  = 7531 */        \
00174         psraw($3, 15, $3);      \
00175         pinteh($3, $22, $22);   /* r22 = b2 */          \
00176 /* even-numbered rows 2 and 6 ($10, $14) */ \
00177         lq($24, TG_2_16, $2);   /* r2  = tn2 */         \
00178 \
00179         pmulth($10, $2, $17);   /* r17 = x2 * tn2 (6420) */ \
00180         psraw($17, 15, $17);    \
00181         pmfhl_uw($3);           /* r3  = 7531 */        \
00182         psraw($3, 15, $3);      \
00183         pinteh($3, $17, $17);   /* r17 = x2 * tn2 */    \
00184         psubh($17, $14, $17);   /* r17 = tm26 */        \
00185 \
00186         pmulth($14, $2, $18);   /* r18 = x6 * tn2 (6420) */ \
00187         psraw($18, 15, $18);    \
00188         pmfhl_uw($3);           /* r3  = 7531 */        \
00189         psraw($3, 15, $3);      \
00190         pinteh($3, $18, $18);   /* r18 = x6 * tn2 */    \
00191         paddh($18, $10, $18);   /* r18 = tp26 */        \
00192 /* even-numbered rows 0 and 4 ($8, $12) */ \
00193         paddh($8, $12, $2);     /* r2  = tp04 */        \
00194         psubh($8, $12, $3);     /* r3  = tm04 */        \
00195 \
00196         paddh($2, $18, $16);    /* r16 = a0 */          \
00197         psubh($2, $18, $19);    /* r19 = a3 */          \
00198         psubh($3, $17, $18);    /* r18 = a2 */          \
00199         paddh($3, $17, $17);    /* r17 = a1 */
00200 
00201 
/*
 * Column pass, second half, writing back to memory.
 * Takes a0..a3 in $16..$19 and b0..b3 in $20..$23 (as left by
 * DCT_8_INV_COL8), forms y[k] = a[k] + b[k] and y[7-k] = a[k] - b[k],
 * shifts each right by SHIFT_INV_COL and stores row k at byte offset 16*k
 * from the address in register blk.  Overwrites $2, $3 and $16..$19.
 */
00202 #define DCT_8_INV_COL8_STORE(blk) \
00203 \
00204         paddh($16, $20, $2);    /* y0  a0+b0 */ \
00205         psubh($16, $20, $16);   /* y7  a0-b0 */ \
00206         psrah($2, SHIFT_INV_COL, $2);           \
00207         psrah($16, SHIFT_INV_COL, $16);         \
00208         sq($2, 0, blk);         /* row 0 */     \
00209         sq($16, 112, blk);      /* row 7 */     \
00210 \
00211         paddh($17, $21, $3);    /* y1  a1+b1 */ \
00212         psubh($17, $21, $17);   /* y6  a1-b1 */ \
00213         psrah($3, SHIFT_INV_COL, $3);           \
00214         psrah($17, SHIFT_INV_COL, $17);         \
00215         sq($3, 16, blk);        /* row 1 */     \
00216         sq($17, 96, blk);       /* row 6 */     \
00217 \
00218         paddh($18, $22, $2);    /* y2  a2+b2 */ \
00219         psubh($18, $22, $18);   /* y5  a2-b2 */ \
00220         psrah($2, SHIFT_INV_COL, $2);           \
00221         psrah($18, SHIFT_INV_COL, $18);         \
00222         sq($2, 32, blk);        /* row 2 */     \
00223         sq($18, 80, blk);       /* row 5 */     \
00224 \
00225         paddh($19, $23, $3);    /* y3  a3+b3 */ \
00226         psubh($19, $23, $19);   /* y4  a3-b3 */ \
00227         psrah($3, SHIFT_INV_COL, $3);           \
00228         psrah($19, SHIFT_INV_COL, $19);         \
00229         sq($3, 48, blk);        /* row 3 */     \
00230         sq($19, 64, blk);       /* row 4 */
00231 
00232 
00233 
/*
 * Column pass, second half, keeping the results in registers.
 * Same butterflies and SHIFT_INV_COL shift as DCT_8_INV_COL8_STORE, but
 * nothing is written to memory.  On exit:
 *   $16 = y0, $17 = y1, $18 = y2, $19 = y3,
 *   $20 = y7, $21 = y6, $22 = y5, $23 = y4.
 * Overwrites $2 and $3.
 */
00234 #define DCT_8_INV_COL8_PMS() \
00235         paddh($16, $20, $2);    /* y0  a0+b0 */ \
00236         psubh($16, $20, $20);   /* y7  a0-b0 */ \
00237         psrah($2, SHIFT_INV_COL, $16);  /* $16 = y0 */ \
00238         psrah($20, SHIFT_INV_COL, $20); /* $20 = y7 */ \
00239 \
00240         paddh($17, $21, $3);    /* y1  a1+b1 */ \
00241         psubh($17, $21, $21);   /* y6  a1-b1 */ \
00242         psrah($3, SHIFT_INV_COL, $17);  /* $17 = y1 */ \
00243         psrah($21, SHIFT_INV_COL, $21); /* $21 = y6 */ \
00244 \
00245         paddh($18, $22, $2);    /* y2  a2+b2 */ \
00246         psubh($18, $22, $22);   /* y5  a2-b2 */ \
00247         psrah($2, SHIFT_INV_COL, $18);  /* $18 = y2 */ \
00248         psrah($22, SHIFT_INV_COL, $22); /* $22 = y5 */ \
00249 \
00250         paddh($19, $23, $3);    /* y3  a3+b3 */ \
00251         psubh($19, $23, $23);   /* y4  a3-b3 */ \
00252         psrah($3, SHIFT_INV_COL, $19);  /* $19 = y3 */ \
00253         psrah($23, SHIFT_INV_COL, $23); /* $23 = y4 */
00254 
/*
 * Clamp the eight 16-bit values in register rs to [0, $11], pack them to
 * bytes and store them at the address in $4, then advance $4 by $5.
 * The callers load the CLIPMAX vector (eight 255s) into $11 and document
 * $4 = dest, $5 = line_size.  $0 is the MIPS zero register.  Overwrites $2.
 * ld3/sd3 take bare register numbers; they are defined in mmi.h (not
 * visible here) -- presumably 64-bit load/store, confirm there.
 */
00255 #define PUT(rs)                 \
00256         pminh(rs, $11, $2);     /* upper clamp against $11 */ \
00257         pmaxh($2, $0, $2);      /* lower clamp against zero */ \
00258         ppacb($0, $2, $2);      /* pack halfwords to bytes */ \
00259         sd3(2, 0, 4);           /* store $2 at 0($4) */ \
00260         __asm__ volatile ("add $4, $5, $4"); /* $4 += $5: next output line */
00261 
/*
 * Write all eight rows with PUT(), top to bottom.  The register order
 * follows the layout left by DCT_8_INV_COL8_PMS, where the lower four rows
 * sit in $23..$20.
 */
00262 #define DCT_8_INV_COL8_PUT() \
00263         PUT($16);        /* y0 */ \
00264         PUT($17);        /* y1 */ \
00265         PUT($18);        /* y2 */ \
00266         PUT($19);        /* y3 */ \
00267         PUT($23);        /* y4 */ \
00268         PUT($22);        /* y5 */ \
00269         PUT($21);        /* y6 */ \
00270         PUT($20);        /* y7 */
00271 
/*
 * Add the eight 16-bit values in register rs to the eight bytes at the
 * address in $4, clamp the sums to [0, $11], pack to bytes and store them
 * back, then advance $4 by $5.  The callers load the CLIPMAX vector into
 * $11 and document $4 = dest, $5 = line_size.  Overwrites $2.
 * ld3/sd3 take bare register numbers; they are defined in mmi.h (not
 * visible here) -- presumably 64-bit load/store, confirm there.
 */
00272 #define ADD(rs)          \
00273         ld3(4, 0, 2);        /* load $2 from 0($4) */ \
00274         pextlb($0, $2, $2);  /* widen bytes to halfwords using zero */ \
00275         paddh($2, rs, $2);   /* add the IDCT result */ \
00276         pminh($2, $11, $2);  /* upper clamp against $11 */ \
00277         pmaxh($2, $0, $2);   /* lower clamp against zero */ \
00278         ppacb($0, $2, $2);   /* pack halfwords to bytes */ \
00279         sd3(2, 0, 4); /* store $2 at 0($4) */ \
00280         __asm__ volatile ("add $4, $5, $4"); /* $4 += $5: next line */
00281 
00282 /* FIXME (original author): the instruction order here has not been scheduled.
       * Applies ADD() to all eight rows, top to bottom.  The register order
       * follows the layout left by DCT_8_INV_COL8_PMS, where the lower four
       * rows sit in $23..$20. */
00283 #define DCT_8_INV_COL8_ADD() \
00284         ADD($16);        /* y0 */ \
00285         ADD($17);        /* y1 */ \
00286         ADD($18);        /* y2 */ \
00287         ADD($19);        /* y3 */ \
00288         ADD($23);        /* y4 */ \
00289         ADD($22);        /* y5 */ \
00290         ADD($21);        /* y6 */ \
00291         ADD($20);        /* y7 */
00292 
00293 
/**
 * In-place 8x8 inverse DCT.
 *
 * Runs the row pass on the eight 16-byte rows of the block (byte offsets
 * 0..112), the column pass on the resulting registers, and stores the
 * result back over the input.
 *
 * @param block coefficients, overwritten with the result; accessed through
 *              register $4 with quadword loads/stores (presumably requires
 *              16-byte alignment -- confirm against mmi.h / the callers)
 *
 * NOTE(review): the code assumes the argument arrives in $4 and that the
 * compiler leaves $2, $3, $7..$15 and $24 alone between the asm statements;
 * only $16..$23 are declared as clobbered.  The lq/sq/... macros are in
 * mmi.h and not visible here, so this could not be checked.
 */
00294 void ff_mmi_idct(int16_t * block)
00295 {
00296         /* $4 = block (first argument register, used directly below) */
00297         __asm__ volatile("la $24, %0"::"m"(consttable[0])); /* $24 = &consttable[0] */
00298         lq($24, ROUNDER_0, $8);  /* $8 = rounder for the row at offset 0; overwritten by that row's result */
00299         lq($24, ROUNDER_1, $7);  /* $7 = rounder for the other seven rows */
00300         DCT_8_INV_ROW1($4, 0, TAB_i_04, $8, $8);      /* row 0 -> $8  */
00301         DCT_8_INV_ROW1($4, 16, TAB_i_17, $7, $9);     /* row 1 -> $9  */
00302         DCT_8_INV_ROW1($4, 32, TAB_i_26, $7, $10);    /* row 2 -> $10 */
00303         DCT_8_INV_ROW1($4, 48, TAB_i_35, $7, $11);    /* row 3 -> $11 */
00304         DCT_8_INV_ROW1($4, 64, TAB_i_04, $7, $12);    /* row 4 -> $12 */
00305         DCT_8_INV_ROW1($4, 80, TAB_i_35, $7, $13);    /* row 5 -> $13 */
00306         DCT_8_INV_ROW1($4, 96, TAB_i_26, $7, $14);    /* row 6 -> $14 */
00307         DCT_8_INV_ROW1($4, 112, TAB_i_17, $7, $15);   /* row 7 -> $15 */
00308         DCT_8_INV_COL8();           /* column pass: a0..a3 / b0..b3 in $16..$23 */
00309         DCT_8_INV_COL8_STORE($4);   /* final butterflies, shift, write back to block */
00310 
00311         // Empty asm whose only purpose is the clobber list: it tells the compiler
        // that $16..$23 are modified, so that it saves and restores them.
00312         __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
00313 }
00314 
00315 
/**
 * 8x8 inverse DCT whose result is clamped to 0..255 and written as bytes.
 *
 * Runs the row and column passes on the block, then stores eight output
 * lines through PUT(), advancing the destination by line_size after each.
 * The coefficient block itself is only read.
 *
 * @param dest      address of the first output line (register $4)
 * @param line_size value added to the destination address after each line
 *                  (register $5)
 * @param block     input coefficients (register $6), read as eight 16-byte rows
 *
 * NOTE(review): the code assumes the arguments arrive in $4/$5/$6 and that
 * the compiler leaves $2, $3, $7..$15 and $24 alone between the asm
 * statements; only $16..$23 are declared as clobbered.  The instruction
 * macros are in mmi.h and not visible here, so this could not be checked.
 */
00316 void ff_mmi_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
00317 {
00318         /* $4 = dest, $5 = line_size, $6 = block (argument registers, used directly below) */
00319         __asm__ volatile("la $24, %0"::"m"(consttable[0])); /* $24 = &consttable[0] */
00320         lq($24, ROUNDER_0, $8);  /* $8 = rounder for the row at offset 0; overwritten by that row's result */
00321         lq($24, ROUNDER_1, $7);  /* $7 = rounder for the other seven rows */
00322         DCT_8_INV_ROW1($6, 0, TAB_i_04, $8, $8);      /* row 0 -> $8  */
00323         DCT_8_INV_ROW1($6, 16, TAB_i_17, $7, $9);     /* row 1 -> $9  */
00324         DCT_8_INV_ROW1($6, 32, TAB_i_26, $7, $10);    /* row 2 -> $10 */
00325         DCT_8_INV_ROW1($6, 48, TAB_i_35, $7, $11);    /* row 3 -> $11 */
00326         DCT_8_INV_ROW1($6, 64, TAB_i_04, $7, $12);    /* row 4 -> $12 */
00327         DCT_8_INV_ROW1($6, 80, TAB_i_35, $7, $13);    /* row 5 -> $13 */
00328         DCT_8_INV_ROW1($6, 96, TAB_i_26, $7, $14);    /* row 6 -> $14 */
00329         DCT_8_INV_ROW1($6, 112, TAB_i_17, $7, $15);   /* row 7 -> $15 */
00330         DCT_8_INV_COL8();           /* column pass: a0..a3 / b0..b3 in $16..$23 */
00331         lq($24, CLIPMAX, $11);      /* $11 is free again: load the 255 clamp vector for PUT() */
00332         DCT_8_INV_COL8_PMS();       /* final butterflies and shift, results stay in $16..$23 */
00333         DCT_8_INV_COL8_PUT();       /* clamp, pack and store the eight lines */
00334 
00335         // Empty asm whose only purpose is the clobber list: it tells the compiler
        // that $16..$23 are modified, so that it saves and restores them.
00336         __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
00337 }
00338 
00339 
/**
 * 8x8 inverse DCT whose result is added to the existing destination bytes.
 *
 * Runs the row and column passes on the block, then for each of eight
 * output lines ADD() loads the destination bytes, adds the IDCT result,
 * clamps to 0..255 and stores the bytes back, advancing the destination by
 * line_size after each line.  The coefficient block itself is only read.
 *
 * @param dest      address of the first line to update (register $4)
 * @param line_size value added to the destination address after each line
 *                  (register $5)
 * @param block     input coefficients (register $6), read as eight 16-byte rows
 *
 * NOTE(review): the code assumes the arguments arrive in $4/$5/$6 and that
 * the compiler leaves $2, $3, $7..$15 and $24 alone between the asm
 * statements; only $16..$23 are declared as clobbered.  The instruction
 * macros are in mmi.h and not visible here, so this could not be checked.
 */
00340 void ff_mmi_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
00341 {
00342         /* $4 = dest, $5 = line_size, $6 = block (argument registers, used directly below) */
00343         __asm__ volatile("la $24, %0"::"m"(consttable[0])); /* $24 = &consttable[0] */
00344         lq($24, ROUNDER_0, $8);  /* $8 = rounder for the row at offset 0; overwritten by that row's result */
00345         lq($24, ROUNDER_1, $7);  /* $7 = rounder for the other seven rows */
00346         DCT_8_INV_ROW1($6, 0, TAB_i_04, $8, $8);      /* row 0 -> $8  */
00347         DCT_8_INV_ROW1($6, 16, TAB_i_17, $7, $9);     /* row 1 -> $9  */
00348         DCT_8_INV_ROW1($6, 32, TAB_i_26, $7, $10);    /* row 2 -> $10 */
00349         DCT_8_INV_ROW1($6, 48, TAB_i_35, $7, $11);    /* row 3 -> $11 */
00350         DCT_8_INV_ROW1($6, 64, TAB_i_04, $7, $12);    /* row 4 -> $12 */
00351         DCT_8_INV_ROW1($6, 80, TAB_i_35, $7, $13);    /* row 5 -> $13 */
00352         DCT_8_INV_ROW1($6, 96, TAB_i_26, $7, $14);    /* row 6 -> $14 */
00353         DCT_8_INV_ROW1($6, 112, TAB_i_17, $7, $15);   /* row 7 -> $15 */
00354         DCT_8_INV_COL8();           /* column pass: a0..a3 / b0..b3 in $16..$23 */
00355         lq($24, CLIPMAX, $11);      /* $11 is free again: load the 255 clamp vector for ADD() */
00356         DCT_8_INV_COL8_PMS();       /* final butterflies and shift, results stay in $16..$23 */
00357         DCT_8_INV_COL8_ADD();       /* add to dest, clamp, pack and store the eight lines */
00358 
00359         // Empty asm whose only purpose is the clobber list: it tells the compiler
        // that $16..$23 are modified, so that it saves and restores them.
00360         __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
00361 }
00362 

Generated on Fri Sep 16 2011 17:17:41 for FFmpeg by  doxygen 1.7.1