• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/simple_idct_mmx.c

Go to the documentation of this file.
00001 /*
00002  * Simple IDCT MMX
00003  *
00004  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 #include "libavcodec/dsputil.h"
00023 #include "libavcodec/simple_idct.h"
00024 #include "dsputil_mmx.h"
00025 
00026 /*
 * Exact (unrounded) values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7,
 * listed in order of i. The C0..C7 macros below are the integer versions
 * of these values (Ci corresponds to index i).
00027 23170.475006
00028 22725.260826
00029 21406.727617
00030 19265.545870
00031 16384.000000
00032 12872.826198
00033 8866.956905
00034 4520.335430
00035 */
00036 #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00037 #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00038 #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00039 #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
/* C4: the round-to-nearest value (16384) is compiled out; the active branch
 * uses 16383, i.e. the exact value minus 0.5, truncated. The reason for
 * preferring 16383 is not stated in this file. */
00040 #if 0
00041 #define C4 16384 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00042 #else
00043 #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5, truncated to integer
00044 #endif
00045 #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00046 #define C6 8867  // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00047 #define C7 4520  // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
00048 
/*
 * Right-shift amounts applied to the 32-bit sums of the two IDCT passes.
 * ROW_SHIFT: used by the row pass (`>> ROW_SHIFT` in idctRow) and to build the
 *            row rounding term 1<<(ROW_SHIFT-1) in the coeffs table. With
 *            coefficients scaled by 1<<14, a shift of 11 leaves a gain of 8,
 *            which matches the `row[0]<<3` DC shortcut in idctRow.
 * COL_SHIFT: used by the column pass (`>> COL_SHIFT` in idctCol), whose
 *            rounding term is 1<<(COL_SHIFT-1).
 * NOTE(review): the trailing "6" on COL_SHIFT is unexplained in the source;
 * presumably it records that 20 = 14 + 6 -- confirm.
 */
00049 #define ROW_SHIFT 11
00050 #define COL_SHIFT 20 // 6
00051 
/*
 * 64-bit constants referenced by name from the inline asm through MANGLE().
 * DECLARE_ASM_CONST is defined outside this file; its first argument is
 * presumably the alignment in bytes -- confirm against its definition.
 *
 * wm1010: mask with 16-bit words 1 and 3 set and words 0 and 2 clear. The
 *         DC_COND_* asm macros AND it with the first source quadword before
 *         OR-ing in the other three quadwords and testing the result for
 *         zero, so words 0 and 2 of the first quadword do not affect the test.
 * d40000: 0x40000 in the low 32 bits, zero in the high 32 bits. DC_COND_IDCT
 *         adds it to mm0 between "pslld $16" and "psrad $13" on its
 *         zero-test path (in DC_COND_ROW_IDCT that add is prefixed with '#').
 */
00052 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
00053 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
00054 
/*
 * Constant table for the MMX IDCT: 14 rows of four int16_t (8 bytes per row).
 * The first two rows are rounding terms; the remaining twelve hold pairs of
 * the C1..C7 cosine constants, each pair repeated twice per row.
 *
 * The inline asm in idct() addresses a table through operand %2 at byte
 * offsets 0, 8, 16, ... 104, and its per-instruction comments name the same
 * coefficient pairs as the rows below (listed there from high word to low
 * word, i.e. reversed with respect to the memory order used here).
 * NOTE(review): the asm operand list is outside the visible source;
 * presumably %2 is bound to this table -- confirm.
 */
00055 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
00056         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* offset 0: row rounding term in each 32-bit lane */
00057 //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
00058 //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
00059         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* offset 8: same, plus 1 in the upper half of the first lane */
00060         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
        // i.e. ((1<<19)/16383)<<11 == 65536, which is a 1 in the upper int16_t
        // of the first 32-bit lane (little-endian pair {low, high}).
00061 //        0, 0, 0, 0,
00062 //        0, 0, 0, 0,
00063 
00064  C4,  C4,  C4,  C4, /* offset 16 */
00065  C4, -C4,  C4, -C4, /* offset 24 */
00066 
00067  C2,  C6,  C2,  C6, /* offset 32 */
00068  C6, -C2,  C6, -C2, /* offset 40 */
00069 
00070  C1,  C3,  C1,  C3, /* offset 48 */
00071  C5,  C7,  C5,  C7, /* offset 56 */
00072 
00073  C3, -C7,  C3, -C7, /* offset 64 */
00074 -C1, -C5, -C1, -C5, /* offset 72 */
00075 
00076  C5, -C1,  C5, -C1, /* offset 80 */
00077  C7,  C3,  C7,  C3, /* offset 88 */
00078 
00079  C7, -C5,  C7, -C5, /* offset 96 */
00080  C3, -C1,  C3, -C1  /* offset 104 */
00081 };
00082 
00083 #if 0
/*
 * Compiled-out helper (it sits inside the surrounding #if 0 region) whose body
 * merely references wm1010 and d40000; judging by its name it exists to stop
 * those constants from being reported as unused.
 * NOTE(review): `temp` is not declared at file scope in the visible source
 * (it is only a local of idct()), so this function would not compile if the
 * #if 0 were removed -- confirm before re-enabling.
 */
00084 static void unused_var_killer(void)
00085 {
00086         int a= wm1010 + d40000; /* sum of two uint64_t values narrowed to int */
00087         temp[0]=a;
00088 }
00089 
/*
 * Scalar C version of the IDCT column pass (compiled out: it sits inside the
 * surrounding #if 0 region).
 *
 * col:   buffer accessed with a stride of 8 elements (col[8*0] .. col[8*7]);
 *        first filled from `input`, then overwritten with the results.
 * input: source of the eight values; read at rows 0, 2, 4 and 6 (stride 8),
 *        columns 0 and 1.
 *
 * The even-indexed values produce a0..a3 (each including the rounding term
 * 1<<(COL_SHIFT-1)), the odd-indexed values produce b0..b3, and the eight
 * results are (a +/- b) >> COL_SHIFT.
 *
 * NOTE(review): the #undef lines below are positional preprocessor directives,
 * so C0..C7 stay undefined as macros for the rest of the translation unit
 * whenever this region is enabled. The local C0 is declared but never used.
 */
00090 static void inline idctCol (int16_t * col, int16_t *input)
00091 {
/* Drop the file-level C0..C7 macros so the same names can be declared as
 * local constants below. */
00092 #undef C0
00093 #undef C1
00094 #undef C2
00095 #undef C3
00096 #undef C4
00097 #undef C5
00098 #undef C6
00099 #undef C7
00100         int a0, a1, a2, a3, b0, b1, b2, b3;
00101         const int C0 = 23170; // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00102         const int C1 = 22725; // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00103         const int C2 = 21407; // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00104         const int C3 = 19266; // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00105         const int C4 = 16383; // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (one less than the rounded value 16384)
00106         const int C5 = 12873; // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00107         const int C6 = 8867;  // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00108         const int C7 = 4520;  // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00109 /*
00110         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
00111                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
00112                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
00113                 return;
00114         }*/
00115 
/* Gather the eight inputs: col[8*k] for k = 0..7 is taken from input rows
 * 0, 2, 0, 2, 4, 6, 4, 6 and columns 0, 0, 1, 1, 0, 0, 1, 1 respectively. */
00116 col[8*0] = input[8*0 + 0];
00117 col[8*1] = input[8*2 + 0];
00118 col[8*2] = input[8*0 + 1];
00119 col[8*3] = input[8*2 + 1];
00120 col[8*4] = input[8*4 + 0];
00121 col[8*5] = input[8*6 + 0];
00122 col[8*6] = input[8*4 + 1];
00123 col[8*7] = input[8*6 + 1];
00124 
        /* Even part: combinations of col[8*0], col[8*2], col[8*4], col[8*6],
         * each with the rounding term added once. */
00125         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
00126         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
00127         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
00128         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
00129 
        /* Odd part: combinations of col[8*1], col[8*3], col[8*5], col[8*7]. */
00130         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
00131         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
00132         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
00133         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
00134 
        /* Outputs k and 7-k are the sum and difference of the same a/b pair;
         * storing into int16_t narrows the shifted 32-bit result. */
00135         col[8*0] = (a0 + b0) >> COL_SHIFT;
00136         col[8*1] = (a1 + b1) >> COL_SHIFT;
00137         col[8*2] = (a2 + b2) >> COL_SHIFT;
00138         col[8*3] = (a3 + b3) >> COL_SHIFT;
00139         col[8*4] = (a3 - b3) >> COL_SHIFT;
00140         col[8*5] = (a2 - b2) >> COL_SHIFT;
00141         col[8*6] = (a1 - b1) >> COL_SHIFT;
00142         col[8*7] = (a0 - b0) >> COL_SHIFT;
00143 }
00144 
/*
 * Scalar C version of the IDCT row pass (compiled out: it sits inside the
 * surrounding #if 0 region).
 *
 * output: receives the eight results at even indices 0, 2, ... 14.
 * input:  source of the eight values; read at indices 0, 1, 4, 5, 8, 9, 12
 *         and 13 and stored into a local row[] in the order 0, 2, 4, 6, 1, 3,
 *         5, 7.
 *
 * If row[1]..row[7] are all zero, every output is set to row[0]<<3. Otherwise
 * the even-indexed values produce a0..a3 (each including the rounding term
 * 1<<(ROW_SHIFT-1)), the odd-indexed values produce b0..b3, and the results
 * are (a +/- b) >> ROW_SHIFT.
 *
 * NOTE(review): the local C0..C7 declarations only compile because idctCol,
 * earlier in the file, #undef's the macros of the same name. The local C0 is
 * declared but never used.
 * NOTE(review): `row[0]<<3` left-shifts a possibly negative value, which ISO C
 * leaves undefined -- confirm before re-enabling this code.
 */
00145 static void inline idctRow (int16_t * output, int16_t * input)
00146 {
00147         int16_t row[8];
00148 
00149         int a0, a1, a2, a3, b0, b1, b2, b3;
00150         const int C0 = 23170; // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00151         const int C1 = 22725; // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00152         const int C2 = 21407; // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00153         const int C3 = 19266; // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00154         const int C4 = 16383; // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (one less than the rounded value 16384)
00155         const int C5 = 12873; // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00156         const int C6 = 8867;  // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00157         const int C7 = 4520;  // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00158 
/* Gather the inputs: even row[] slots come from input[0], [1], [4], [5];
 * odd row[] slots come from input[8], [9], [12], [13]. */
00159 row[0] = input[0];
00160 row[2] = input[1];
00161 row[4] = input[4];
00162 row[6] = input[5];
00163 row[1] = input[8];
00164 row[3] = input[9];
00165 row[5] = input[12];
00166 row[7] = input[13];
00167 
        /* Shortcut when only row[0] can be non-zero: replicate row[0]<<3 into
         * all eight outputs and return without running the full transform. */
00168         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
00169                 row[0] = row[1] = row[2] = row[3] = row[4] =
00170                         row[5] = row[6] = row[7] = row[0]<<3;
00171         output[0]  = row[0];
00172         output[2]  = row[1];
00173         output[4]  = row[2];
00174         output[6]  = row[3];
00175         output[8]  = row[4];
00176         output[10] = row[5];
00177         output[12] = row[6];
00178         output[14] = row[7];
00179                 return;
00180         }
00181 
        /* Even part: combinations of row[0], row[2], row[4], row[6], each with
         * the rounding term added once. */
00182         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
00183         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
00184         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
00185         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
00186 
        /* Odd part: combinations of row[1], row[3], row[5], row[7]. */
00187         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00188         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00189         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00190         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00191 
        /* Outputs k and 7-k are the sum and difference of the same a/b pair;
         * storing into int16_t narrows the shifted 32-bit result. */
00192         row[0] = (a0 + b0) >> ROW_SHIFT;
00193         row[1] = (a1 + b1) >> ROW_SHIFT;
00194         row[2] = (a2 + b2) >> ROW_SHIFT;
00195         row[3] = (a3 + b3) >> ROW_SHIFT;
00196         row[4] = (a3 - b3) >> ROW_SHIFT;
00197         row[5] = (a2 - b2) >> ROW_SHIFT;
00198         row[6] = (a1 - b1) >> ROW_SHIFT;
00199         row[7] = (a0 - b0) >> ROW_SHIFT;
00200 
        /* Scatter the results to every second element of output. */
00201         output[0]  = row[0];
00202         output[2]  = row[1];
00203         output[4]  = row[2];
00204         output[6]  = row[3];
00205         output[8]  = row[4];
00206         output[10] = row[5];
00207         output[12] = row[6];
00208         output[14] = row[7];
00209 }
00210 #endif
00211 
00212 static inline void idct(int16_t *block)
00213 {
00214         DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
00215         int16_t * const temp= (int16_t*)align_tmp;
00216 
00217         __asm__ volatile(
00218 #if 0 //Alternative, simpler variant
00219 
00220 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00221         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00222         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00223         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00224         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00225         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00226         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00227         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00228         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00229         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00230         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00231         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00232         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00233         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00234         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00235         #rounder ", %%mm4               \n\t"\
00236         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00237         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00238         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00239         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00240         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00241         #rounder ", %%mm0               \n\t"\
00242         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00243         "paddd %%mm0, %%mm0             \n\t" \
00244         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00245         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00246         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00247         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00248         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00249         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00250         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00251         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00252         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00253         "psrad $" #shift ", %%mm7       \n\t"\
00254         "psrad $" #shift ", %%mm4       \n\t"\
00255         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00256         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00257         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00258         "psrad $" #shift ", %%mm1       \n\t"\
00259         "psrad $" #shift ", %%mm2       \n\t"\
00260         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00261         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00262         "movq %%mm7, " #dst "           \n\t"\
00263         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00264         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00265         "movq %%mm2, 24+" #dst "        \n\t"\
00266         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00267         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00268         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00269         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00270         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00271         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00272         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00273         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00274         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
00275         "psrad $" #shift ", %%mm2       \n\t"\
00276         "psrad $" #shift ", %%mm0       \n\t"\
00277         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00278         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00279         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00280         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00281         "psrad $" #shift ", %%mm6       \n\t"\
00282         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00283         "movq %%mm2, 8+" #dst "         \n\t"\
00284         "psrad $" #shift ", %%mm4       \n\t"\
00285         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00286         "movq %%mm4, 16+" #dst "        \n\t"\
00287 
00288 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
00289         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00290         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00291         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00292         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00293         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00294         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00295         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00296         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00297         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00298         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00299         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00300         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00301         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00302         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00303         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00304         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00305         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00306         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00307         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00308         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00309         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00310         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00311         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00312         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00313         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00314         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00315         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00316         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00317         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00318         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00319         "psrad $" #shift ", %%mm7       \n\t"\
00320         "psrad $" #shift ", %%mm4       \n\t"\
00321         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00322         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00323         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00324         "psrad $" #shift ", %%mm0       \n\t"\
00325         "psrad $" #shift ", %%mm2       \n\t"\
00326         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00327         "movd %%mm7, " #dst "           \n\t"\
00328         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00329         "movd %%mm0, 16+" #dst "        \n\t"\
00330         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00331         "movd %%mm2, 96+" #dst "        \n\t"\
00332         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00333         "movd %%mm4, 112+" #dst "       \n\t"\
00334         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00335         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00336         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00337         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00338         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00339         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00340         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00341         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00342         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00343         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00344         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
00345         "psrad $" #shift ", %%mm2       \n\t"\
00346         "psrad $" #shift ", %%mm5       \n\t"\
00347         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00348         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00349         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00350         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00351         "psrad $" #shift ", %%mm6       \n\t"\
00352         "psrad $" #shift ", %%mm4       \n\t"\
00353         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00354         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00355         "movd %%mm2, 32+" #dst "        \n\t"\
00356         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00357         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00358         "movd %%mm6, 48+" #dst "        \n\t"\
00359         "movd %%mm4, 64+" #dst "        \n\t"\
00360         "movd %%mm5, 80+" #dst "        \n\t"\
00361 
00362 
00363 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00364         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00365         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00366         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00367         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00368         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
00369         "pand %%mm0, %%mm4              \n\t"\
00370         "por %%mm1, %%mm4               \n\t"\
00371         "por %%mm2, %%mm4               \n\t"\
00372         "por %%mm3, %%mm4               \n\t"\
00373         "packssdw %%mm4,%%mm4           \n\t"\
00374         "movd %%mm4, %%eax              \n\t"\
00375         "orl %%eax, %%eax               \n\t"\
00376         "jz 1f                          \n\t"\
00377         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00378         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00379         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00380         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00381         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00382         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00383         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00384         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00385         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00386         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00387         #rounder ", %%mm4               \n\t"\
00388         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00389         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00390         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00391         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00392         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00393         #rounder ", %%mm0               \n\t"\
00394         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00395         "paddd %%mm0, %%mm0             \n\t" \
00396         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00397         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00398         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00399         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00400         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00401         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00402         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00403         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00404         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00405         "psrad $" #shift ", %%mm7       \n\t"\
00406         "psrad $" #shift ", %%mm4       \n\t"\
00407         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00408         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00409         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00410         "psrad $" #shift ", %%mm1       \n\t"\
00411         "psrad $" #shift ", %%mm2       \n\t"\
00412         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00413         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00414         "movq %%mm7, " #dst "           \n\t"\
00415         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00416         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00417         "movq %%mm2, 24+" #dst "        \n\t"\
00418         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00419         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00420         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00421         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00422         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00423         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00424         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00425         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00426         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
00427         "psrad $" #shift ", %%mm2       \n\t"\
00428         "psrad $" #shift ", %%mm0       \n\t"\
00429         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00430         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00431         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00432         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00433         "psrad $" #shift ", %%mm6       \n\t"\
00434         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00435         "movq %%mm2, 8+" #dst "         \n\t"\
00436         "psrad $" #shift ", %%mm4       \n\t"\
00437         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00438         "movq %%mm4, 16+" #dst "        \n\t"\
00439         "jmp 2f                         \n\t"\
00440         "1:                             \n\t"\
00441         "pslld $16, %%mm0               \n\t"\
00442         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
00443         "psrad $13, %%mm0               \n\t"\
00444         "packssdw %%mm0, %%mm0          \n\t"\
00445         "movq %%mm0, " #dst "           \n\t"\
00446         "movq %%mm0, 8+" #dst "         \n\t"\
00447         "movq %%mm0, 16+" #dst "        \n\t"\
00448         "movq %%mm0, 24+" #dst "        \n\t"\
00449         "2:                             \n\t"
00450 
00451 
00452 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
00453 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00454 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
00455 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
00456 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
00457 
00458 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
00459 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
00460 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
00461 
00462 
00463 //IDCT(      src0,   src4,   src1,    src5,    dst, shift)
00464 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00465 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00466 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00467 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00468 
00469 #else
00470 
00471 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00472         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00473         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00474         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00475         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00476         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
00477         "pand %%mm0, %%mm4              \n\t"\
00478         "por %%mm1, %%mm4               \n\t"\
00479         "por %%mm2, %%mm4               \n\t"\
00480         "por %%mm3, %%mm4               \n\t"\
00481         "packssdw %%mm4,%%mm4           \n\t"\
00482         "movd %%mm4, %%eax              \n\t"\
00483         "orl %%eax, %%eax               \n\t"\
00484         "jz 1f                          \n\t"\
00485         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00486         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00487         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00488         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00489         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00490         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00491         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00492         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00493         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00494         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00495         #rounder ", %%mm4               \n\t"\
00496         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00497         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00498         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00499         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00500         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00501         #rounder ", %%mm0               \n\t"\
00502         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00503         "paddd %%mm0, %%mm0             \n\t" \
00504         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00505         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00506         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00507         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00508         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00509         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00510         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00511         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00512         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00513         "psrad $" #shift ", %%mm7       \n\t"\
00514         "psrad $" #shift ", %%mm4       \n\t"\
00515         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00516         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00517         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00518         "psrad $" #shift ", %%mm1       \n\t"\
00519         "psrad $" #shift ", %%mm2       \n\t"\
00520         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00521         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00522         "movq %%mm7, " #dst "           \n\t"\
00523         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00524         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00525         "movq %%mm2, 24+" #dst "        \n\t"\
00526         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00527         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00528         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00529         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00530         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00531         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00532         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00533         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00534         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
00535         "psrad $" #shift ", %%mm2       \n\t"\
00536         "psrad $" #shift ", %%mm0       \n\t"\
00537         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00538         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00539         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00540         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
00541         "psrad $" #shift ", %%mm6       \n\t"\
00542         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00543         "movq %%mm2, 8+" #dst "         \n\t"\
00544         "psrad $" #shift ", %%mm4       \n\t"\
00545         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00546         "movq %%mm4, 16+" #dst "        \n\t"\
00547         "jmp 2f                         \n\t"\
00548         "1:                             \n\t"\
00549         "pslld $16, %%mm0               \n\t"\
00550         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
00551         "psrad $13, %%mm0               \n\t"\
00552         "packssdw %%mm0, %%mm0          \n\t"\
00553         "movq %%mm0, " #dst "           \n\t"\
00554         "movq %%mm0, 8+" #dst "         \n\t"\
00555         "movq %%mm0, 16+" #dst "        \n\t"\
00556         "movq %%mm0, 24+" #dst "        \n\t"\
00557         "2:                             \n\t"
00558 
/*
 * Z_COND_IDCT: one IDCT pass over the four quadwords src0/src4/src1/src5,
 * skipped entirely when all of them are zero.
 *
 *   src0, src4, src1, src5  memory operands holding the inputs (the lane
 *                           layout is given by the per-line comments)
 *   dst                     memory operand; results are stored to dst,
 *                           8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (e.g. paddd (%2)) applied to the
 *                           two C4 sums before they are combined with the
 *                           other terms
 *   shift                   arithmetic right shift applied to every result
 *   bt                      label jumped to, with nothing stored to dst, when
 *                           the OR of the four inputs is zero
 *
 * Coefficients are read from offsets 16..104 of operand %2. mm0-mm7 and %eax
 * are used as scratch. Results are packed to 16 bits with signed saturation.
 */
00559 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
00560         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00561         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00562         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00563         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00564         "movq %%mm0, %%mm4              \n\t" /* mm4 = OR of the four input quadwords */\
00565         "por %%mm1, %%mm4               \n\t"\
00566         "por %%mm2, %%mm4               \n\t"\
00567         "por %%mm3, %%mm4               \n\t"\
00568         "packssdw %%mm4,%%mm4           \n\t" /* fold both dwords into the low 32 bits */\
00569         "movd %%mm4, %%eax              \n\t"\
00570         "orl %%eax, %%eax               \n\t" /* ZF set iff every input word was zero */\
00571         "jz " #bt "                     \n\t" /* all zero: go to bt, dst is left unwritten */\
00572         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00573         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00574         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00575         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00576         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00577         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00578         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00579         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00580         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00581         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00582         #rounder ", %%mm4               \n\t" /* apply rounder to C4R4+C4R0 */\
00583         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00584         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00585         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00586         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00587         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00588         #rounder ", %%mm0               \n\t" /* apply rounder to -C4R4+C4R0 */\
00589         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00590         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 leaves A2 */ \
00591         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00592         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00593         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00594         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00595         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00596         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00597         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00598         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00599         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00600         "psrad $" #shift ", %%mm7       \n\t"\
00601         "psrad $" #shift ", %%mm4       \n\t"\
00602         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00603         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00604         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00605         "psrad $" #shift ", %%mm1       \n\t"\
00606         "psrad $" #shift ", %%mm2       \n\t"\
00607         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00608         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00609         "movq %%mm7, " #dst "           \n\t"\
00610         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00611         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00612         "movq %%mm2, 24+" #dst "        \n\t"\
00613         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00614         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00615         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00616         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00617         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00618         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00619         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00620         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00621         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
00622         "psrad $" #shift ", %%mm2       \n\t"\
00623         "psrad $" #shift ", %%mm0       \n\t"\
00624         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00625         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00626         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00627         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00628         "psrad $" #shift ", %%mm6       \n\t"\
00629         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00630         "movq %%mm2, 8+" #dst "         \n\t"\
00631         "psrad $" #shift ", %%mm4       \n\t"\
00632         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00633         "movq %%mm4, 16+" #dst "        \n\t"\
00634 
/*
 * ROW_IDCT: unconditional IDCT pass over the four quadwords
 * src0/src4/src1/src5. The instruction sequence is the one Z_COND_IDCT runs
 * after its zero test, so there is no early exit here.
 *
 *   src0, src4, src1, src5  memory operands holding the inputs
 *   dst                     memory operand; results go to dst, 8+dst, 16+dst
 *                           and 24+dst
 *   rounder                 instruction text applied to the two C4 sums
 *   shift                   arithmetic right shift applied to every result
 *
 * Coefficients are read from offsets 16..104 of operand %2; mm0-mm7 are used
 * as scratch. Results are packed to 16 bits with signed saturation.
 */
00635 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00636         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00637         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00638         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00639         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00640         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00641         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00642         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00643         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00644         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00645         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00646         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00647         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00648         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00649         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00650         #rounder ", %%mm4               \n\t" /* apply rounder to C4R4+C4R0 */\
00651         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00652         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00653         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00654         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
00655         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00656         #rounder ", %%mm0               \n\t" /* apply rounder to -C4R4+C4R0 */\
00657         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
00658         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 leaves A2 */ \
00659         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
00660         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00661         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
00662         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
00663         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00664         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00665         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00666         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00667         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
00668         "psrad $" #shift ", %%mm7       \n\t"\
00669         "psrad $" #shift ", %%mm4       \n\t"\
00670         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
00671         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
00672         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00673         "psrad $" #shift ", %%mm1       \n\t"\
00674         "psrad $" #shift ", %%mm2       \n\t"\
00675         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
00676         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
00677         "movq %%mm7, " #dst "           \n\t"\
00678         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
00679         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00680         "movq %%mm2, 24+" #dst "        \n\t"\
00681         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00682         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00683         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00684         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00685         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
00686         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00687         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00688         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00689         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
00690         "psrad $" #shift ", %%mm2       \n\t"\
00691         "psrad $" #shift ", %%mm0       \n\t"\
00692         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00693         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
00694         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00695         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00696         "psrad $" #shift ", %%mm6       \n\t"\
00697         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
00698         "movq %%mm2, 8+" #dst "         \n\t"\
00699         "psrad $" #shift ", %%mm4       \n\t"\
00700         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
00701         "movq %%mm4, 16+" #dst "        \n\t"\
00702 
/*
 * First pass: four groups of 32 bytes are read from operand %0 and the
 * results written to the same offsets of operand %1, with shift 11.
 * The three Z_COND_IDCT groups store nothing when their input is all zero
 * and branch to the forward label given as last argument (4, 2 or 1), where
 * a second-pass variant that does not read the unwritten group is used.
 * The first group goes through DC_COND_IDCT with a different rounder
 * operand (8(%2) instead of (%2)).
 */
00703 //macro(        src0,   src4,   src1,   src5,    dst,   rounder, shift[, bt])
00704 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00705 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
00706 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
00707 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
00708 
/*
 * IDCT, variant that reads all four inputs (used when none of the
 * Z_COND_IDCT groups of the first pass branched away).
 *
 *   src0, src4, src1, src5  memory operands holding the inputs
 *   dst                     memory operand; eight 32-bit results are stored
 *                           with movd at dst + 0, 16, 32, ... 112
 *   shift                   arithmetic right shift applied to every result
 *
 * No rounder is applied in this macro. Coefficients are read from offsets
 * 16..104 of operand %2; mm0-mm7 are used as scratch. Each result is packed
 * to 16 bits with signed saturation before the store.
 */
00709 #undef IDCT
00710 #define IDCT(src0, src4, src1, src5, dst, shift) \
00711         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00712         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00713         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00714         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00715         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00716         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00717         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00718         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00719         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00720         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00721         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00722         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00723         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00724         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00725         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00726         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00727         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00728         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00729         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00730         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00731         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00732         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00733         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00734         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00735         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00736         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00737         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00738         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00739         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00740         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00741         "psrad $" #shift ", %%mm7       \n\t"\
00742         "psrad $" #shift ", %%mm4       \n\t"\
00743         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00744         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00745         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00746         "psrad $" #shift ", %%mm0       \n\t"\
00747         "psrad $" #shift ", %%mm2       \n\t"\
00748         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00749         "movd %%mm7, " #dst "           \n\t"\
00750         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00751         "movd %%mm0, 16+" #dst "        \n\t"\
00752         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00753         "movd %%mm2, 96+" #dst "        \n\t"\
00754         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00755         "movd %%mm4, 112+" #dst "       \n\t"\
00756         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00757         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00758         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00759         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00760         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00761         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00762         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00763         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00764         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00765         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00766         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00767         "psrad $" #shift ", %%mm2       \n\t"\
00768         "psrad $" #shift ", %%mm5       \n\t"\
00769         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00770         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00771         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00772         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00773         "psrad $" #shift ", %%mm6       \n\t"\
00774         "psrad $" #shift ", %%mm4       \n\t"\
00775         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00776         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00777         "movd %%mm2, 32+" #dst "        \n\t"\
00778         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00779         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00780         "movd %%mm6, 48+" #dst "        \n\t"\
00781         "movd %%mm4, 64+" #dst "        \n\t"\
00782         "movd %%mm5, 80+" #dst "        \n\t"
00783 
00784 
/*
 * Second pass, all four groups present: four IDCT() expansions read the
 * intermediate data from operand %1 (src offsets step by 8 bytes) and store
 * to operand %0 (dst offsets step by 4 bytes), with shift 20. IDCT here is
 * the most recent #define preceding these calls.
 */
00785 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00786 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00787 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00788 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00789 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00790         "jmp 9f                         \n\t" /* skip the other variants; label 9 is defined further down */
00791 
/*
 * Label 4: target of Z_COND_IDCT(32(%0) ..., 4f), i.e. the 32(%0) group was
 * all zero and 32(%1)..56(%1) were not written. The remaining two groups are
 * still tested here; an all-zero group branches on to label 6 or 5.
 * NOTE(review): the "#" string in front of ASMALIGN(4) presumably turns the
 * directive that macro emits into an assembler comment, i.e. the alignment
 * is disabled. ASMALIGN is defined outside this chunk; confirm.
 */
00792         "#" ASMALIGN(4)                      \
00793         "4:                             \n\t"
00794 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
00795 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
00796 
/*
 * IDCT, variant that never reads src1 (the parameter is accepted but
 * unused), so all R1/R3 terms are left out of the B sums.
 * It is defined after label 4, which is reached when the first-pass group
 * writing 32(%1)..56(%1) was skipped; those are the src1 operands of the
 * IDCT() calls that follow this definition.
 *
 *   src0, src4, src5  memory operands holding the inputs
 *   dst               memory operand; eight 32-bit results are stored with
 *                     movd at dst + 0, 16, 32, ... 112
 *   shift             arithmetic right shift applied to every result
 *
 * Coefficients come from operand %2; mm0-mm7 are used as scratch.
 */
00797 #undef IDCT
00798 #define IDCT(src0, src4, src1, src5, dst, shift) \
00799         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00800         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
00801         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00802         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00803         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00804         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00805         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00806         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
00807         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
00808         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
00809         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
00810         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00811         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
00812         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
00813         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00814         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
00815         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
00816         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00817         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00818         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
00819         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00820         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
00821         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00822         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00823         "psrad $" #shift ", %%mm1       \n\t"\
00824         "psrad $" #shift ", %%mm4       \n\t"\
00825         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00826         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00827         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00828         "psrad $" #shift ", %%mm0       \n\t"\
00829         "psrad $" #shift ", %%mm2       \n\t"\
00830         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
00831         "movd %%mm1, " #dst "           \n\t"\
00832         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00833         "movd %%mm0, 16+" #dst "        \n\t"\
00834         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00835         "movd %%mm2, 96+" #dst "        \n\t"\
00836         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00837         "movd %%mm4, 112+" #dst "       \n\t"\
00838         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
00839         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00840         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00841         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00842         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00843         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00844         "psrad $" #shift ", %%mm2       \n\t"\
00845         "psrad $" #shift ", %%mm5       \n\t"\
00846         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
00847         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00848         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
00849         "psrad $" #shift ", %%mm6       \n\t"\
00850         "psrad $" #shift ", %%mm1       \n\t"\
00851         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00852         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00853         "movd %%mm2, 32+" #dst "        \n\t"\
00854         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
00855         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00856         "movd %%mm6, 48+" #dst "        \n\t"\
00857         "movd %%mm1, 64+" #dst "        \n\t"\
00858         "movd %%mm5, 80+" #dst "        \n\t"
00859 
/*
 * Second pass on the label-4 path: same operand pattern as the other
 * second-pass call groups (read from %1, store to %0, shift 20), expanded
 * with the IDCT variant defined just before these calls, which does not
 * read its src1 argument.
 */
00860 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00861 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00862 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00863 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00864 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00865         "jmp 9f                         \n\t" /* skip the other variants; label 9 is defined further down */
00866 
/*
 * Label 6: target of Z_COND_IDCT(64(%0) ..., 6f) on the label-4 path, so
 * neither 32(%1)..56(%1) nor 64(%1)..88(%1) were written. Only the last
 * group is still tested; if it is all zero too, control goes to label 7.
 */
00867         "#" ASMALIGN(4)                      \
00868         "6:                             \n\t"
00869 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
00870 
/*
 * IDCT, variant that reads only src0 and src5; src4 and src1 are accepted
 * but unused, so the C2/C6 and R1/R3 terms are left out (A0 == A3 and
 * A1 == A2 before the B terms are applied).
 * It is defined after label 6, which is reached when the first-pass groups
 * writing 32(%1)..56(%1) and 64(%1)..88(%1) were both skipped.
 *
 *   src0, src5  memory operands holding the inputs
 *   dst         memory operand; eight 32-bit results are stored with movd
 *               at dst + 0, 16, 32, ... 112
 *   shift       arithmetic right shift applied to every result
 *
 * Coefficients come from operand %2; mm0-mm7 are used as scratch.
 */
00871 #undef IDCT
00872 #define IDCT(src0, src4, src1, src5, dst, shift) \
00873         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00874         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00875         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00876         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00877         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00878         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00879         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00880         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00881         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00882         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00883         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
00884         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00885         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
00886         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00887         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00888         "psrad $" #shift ", %%mm1       \n\t"\
00889         "psrad $" #shift ", %%mm4       \n\t"\
00890         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00891         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00892         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00893         "psrad $" #shift ", %%mm0       \n\t"\
00894         "psrad $" #shift ", %%mm2       \n\t"\
00895         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
00896         "movd %%mm1, " #dst "           \n\t"\
00897         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00898         "movd %%mm0, 16+" #dst "        \n\t"\
00899         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00900         "movd %%mm2, 96+" #dst "        \n\t"\
00901         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00902         "movd %%mm4, 112+" #dst "       \n\t"\
00903         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
00904         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00905         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00906         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00907         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00908         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00909         "psrad $" #shift ", %%mm2       \n\t"\
00910         "psrad $" #shift ", %%mm5       \n\t"\
00911         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
00912         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00913         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
00914         "psrad $" #shift ", %%mm6       \n\t"\
00915         "psrad $" #shift ", %%mm1       \n\t"\
00916         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00917         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00918         "movd %%mm2, 32+" #dst "        \n\t"\
00919         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
00920         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00921         "movd %%mm6, 48+" #dst "        \n\t"\
00922         "movd %%mm1, 64+" #dst "        \n\t"\
00923         "movd %%mm5, 80+" #dst "        \n\t"
00924 
00925 
/*
 * Second pass on the label-6 path: read from %1, store to %0, shift 20,
 * expanded with the IDCT variant defined just before these calls, which
 * reads only its src0 and src5 arguments.
 */
00926 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
00927 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00928 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00929 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00930 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00931         "jmp 9f                         \n\t" /* skip the other variants; label 9 is defined further down */
00932 
/*
 * Label 2: target of Z_COND_IDCT(64(%0) ..., 2f) in the first pass, i.e.
 * the 32(%0) group was non-zero but the 64(%0) group was all zero, so
 * 64(%1)..88(%1) were not written. The last group is still tested; if it
 * is all zero too, control goes to label 3.
 */
00933         "#" ASMALIGN(4)                      \
00934         "2:                             \n\t"
00935 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
00936 
/*
 * IDCT, variant that never reads src4 (the parameter is accepted but
 * unused), so the C2/C6 terms are left out (A0 == A3 and A1 == A2 before
 * the B terms are applied).
 * It is defined after label 2, which is reached when the first-pass group
 * writing 64(%1)..88(%1) was skipped; those are the src4 operands of the
 * IDCT() calls that follow this definition.
 *
 *   src0, src1, src5  memory operands holding the inputs
 *   dst               memory operand; eight 32-bit results are stored with
 *                     movd at dst + 0, 16, 32, ... 112
 *   shift             arithmetic right shift applied to every result
 *
 * Coefficients come from operand %2; mm0-mm7 are used as scratch.
 */
00937 #undef IDCT
00938 #define IDCT(src0, src4, src1, src5, dst, shift) \
00939         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
00940         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
00941         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
00942         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
00943         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00944         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
00945         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00946         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
00947         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
00948         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
00949         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
00950         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
00951         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
00952         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
00953         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
00954         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
00955         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
00956         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
00957         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
00958         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
00959         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
00960         "psrad $" #shift ", %%mm7       \n\t"\
00961         "psrad $" #shift ", %%mm4       \n\t"\
00962         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
00963         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
00964         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
00965         "psrad $" #shift ", %%mm0       \n\t"\
00966         "psrad $" #shift ", %%mm2       \n\t"\
00967         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
00968         "movd %%mm7, " #dst "           \n\t"\
00969         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
00970         "movd %%mm0, 16+" #dst "        \n\t"\
00971         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
00972         "movd %%mm2, 96+" #dst "        \n\t"\
00973         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
00974         "movd %%mm4, 112+" #dst "       \n\t"\
00975         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
00976         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
00977         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
00978         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
00979         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
00980         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
00981         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
00982         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
00983         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
00984         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
00985         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
00986         "psrad $" #shift ", %%mm2       \n\t"\
00987         "psrad $" #shift ", %%mm5       \n\t"\
00988         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
00989         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
00990         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
00991         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
00992         "psrad $" #shift ", %%mm6       \n\t"\
00993         "psrad $" #shift ", %%mm4       \n\t"\
00994         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
00995         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
00996         "movd %%mm2, 32+" #dst "        \n\t"\
00997         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
00998         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
00999         "movd %%mm6, 48+" #dst "        \n\t"\
01000         "movd %%mm4, 64+" #dst "        \n\t"\
01001         "movd %%mm5, 80+" #dst "        \n\t"
01002 
/*
 * Second pass on the label-2 path: read from %1, store to %0, shift 20,
 * expanded with the IDCT variant defined just before these calls, which
 * does not read its src4 argument.
 */
01003 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01004 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01005 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01006 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01007 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01008         "jmp 9f                         \n\t" /* skip the other variants; label 9 is defined further down */
01009 
/*
 * Label 3: target of Z_COND_IDCT(96(%0) ..., 3f) on the label-2 path, so
 * neither 64(%1)..88(%1) nor 96(%1)..120(%1) were written by the first pass.
 */
01010         "#" ASMALIGN(4)                      \
01011         "3:                             \n\t"
01012 #undef IDCT
/*
 * IDCT variant defined right after asm label 3.
 *
 * Only src0 (R0/R4 terms, mm0) and src1 (R1/R3 terms, mm2) are loaded; the
 * src4 and src5 arguments are accepted but never referenced by this expansion.
 * The even part therefore reduces to A0 = A3 = C4*R4 + C4*R0 and
 * A1 = A2 = -C4*R4 + C4*R0 (mm6 and mm5 are plain copies), while the odd part
 * B0..B3 is built from src1 alone.
 *
 * Each sum An+Bn and difference An-Bn is arithmetically shifted right by
 * `shift`, narrowed to 16 bits with signed saturation (packssdw) and stored
 * as one 32-bit movd: An+Bn at byte offset 16*n from dst, An-Bn at byte
 * offset 16*(7-n) from dst.
 *
 * NOTE(review): presumably the branch logic earlier in idct() only reaches
 * label 3 when the src4/src5 inputs are all zero -- confirm against that code.
 */
01013 #define IDCT(src0, src4, src1, src5, dst, shift) \
01014         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01015         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
01016         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01017         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01018         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01019         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01020         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01021         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
01022         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
01023         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01024         "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 (inferred from the product below) */\
01025         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
01026         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
01027         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
01028         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
01029         "psrad $" #shift ", %%mm7       \n\t"\
01030         "psrad $" #shift ", %%mm4       \n\t"\
01031         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
01032         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
01033         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
01034         "psrad $" #shift ", %%mm0       \n\t"\
01035         "psrad $" #shift ", %%mm1       \n\t"\
01036         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
01037         "movd %%mm7, " #dst "           \n\t"\
01038         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
01039         "movd %%mm0, 16+" #dst "        \n\t"\
01040         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
01041         "movd %%mm1, 96+" #dst "        \n\t"\
01042         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
01043         "movd %%mm4, 112+" #dst "       \n\t"\
01044         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
01045         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
01046         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
01047         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
01048         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
01049         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
01050         "psrad $" #shift ", %%mm1       \n\t"\
01051         "psrad $" #shift ", %%mm5       \n\t"\
01052         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
01053         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
01054         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
01055         "psrad $" #shift ", %%mm6       \n\t"\
01056         "psrad $" #shift ", %%mm4       \n\t"\
01057         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
01058         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
01059         "movd %%mm1, 32+" #dst "        \n\t"\
01060         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
01061         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
01062         "movd %%mm6, 48+" #dst "        \n\t"\
01063         "movd %%mm4, 64+" #dst "        \n\t"\
01064         "movd %%mm5, 80+" #dst "        \n\t"
01065 
01066 
01067 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01068 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01069 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01070 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01071 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01072         "jmp 9f                         \n\t"
01073 
01074         "#" ASMALIGN(4)                      \
01075         "5:                             \n\t"
01076 #undef IDCT
/*
 * IDCT variant defined right after asm label 5.
 *
 * Reads src0 and src4 twice: once at the given address (mm0/mm1) and once
 * 8 bytes further on (mm2/mm3), so a single expansion handles two adjacent
 * 64-bit groups. src1 and src5 are never referenced, so there is no odd (B)
 * part; only the even terms A0..A3 are computed.
 *
 * After the arithmetic right shift by `shift`, the two groups are packed
 * together with signed saturation (packssdw) and each result is written twice
 * with a 64-bit movq: A0 at dst and 112+dst, A1 at 16+dst and 96+dst,
 * A2 at 32+dst and 80+dst, A3 at 48+dst and 64+dst.
 *
 * NOTE(review): presumably the branch logic earlier in idct() only reaches
 * label 5 when the src1/src5 inputs are all zero -- confirm against that code.
 */
01077 #define IDCT(src0, src4, src1, src5, dst, shift) \
01078         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01079         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
01080         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01081         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01082         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01083         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01084         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
01085         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
01086         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
01087         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
01088         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01089         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
01090         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
01091         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01092         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
01093         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
01094         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
01095         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
01096         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
01097         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01098         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
01099         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01100         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
01101         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
01102         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
01103         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
01104         "paddd %%mm1, %%mm1             \n\t" /* 2(C4R4+C4R0)   2(C4r4+C4r0) */\
01105         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
01106         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
01107         "paddd %%mm2, %%mm2             \n\t" /* 2(-C4R4+C4R0)  2(-C4r4+C4r0) */\
01108         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
01109         "psrad $" #shift ", %%mm4       \n\t"\
01110         "psrad $" #shift ", %%mm7       \n\t"\
01111         "psrad $" #shift ", %%mm3       \n\t"\
01112         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
01113         "movq %%mm4, " #dst "           \n\t"\
01114         "psrad $" #shift ", %%mm0       \n\t"\
01115         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
01116         "movq %%mm0, 16+" #dst "        \n\t"\
01117         "movq %%mm0, 96+" #dst "        \n\t"\
01118         "movq %%mm4, 112+" #dst "       \n\t"\
01119         "psrad $" #shift ", %%mm5       \n\t"\
01120         "psrad $" #shift ", %%mm6       \n\t"\
01121         "psrad $" #shift ", %%mm2       \n\t"\
01122         "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 */\
01123         "movq %%mm5, 32+" #dst "        \n\t"\
01124         "psrad $" #shift ", %%mm1       \n\t"\
01125         "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 */\
01126         "movq %%mm6, 48+" #dst "        \n\t"\
01127         "movq %%mm6, 64+" #dst "        \n\t"\
01128         "movq %%mm5, 80+" #dst "        \n\t"
01129 
01130 
01131 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01132 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01133 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01134 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01135 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01136         "jmp 9f                         \n\t"
01137 
01138 
01139         "#" ASMALIGN(4)                      \
01140         "1:                             \n\t"
01141 #undef IDCT
/*
 * IDCT variant defined right after asm label 1.
 *
 * Loads src0 (R0/R4 terms, mm0), src4 (R2/R6 terms, mm1) and src1 (R1/R3
 * terms, mm2); the src5 argument is accepted but never referenced by this
 * expansion. The even part A0..A3 is computed in full from src0 and src4,
 * while the odd part B0..B3 is built from src1 alone.
 *
 * Each sum An+Bn and difference An-Bn is arithmetically shifted right by
 * `shift`, narrowed to 16 bits with signed saturation (packssdw) and stored
 * as one 32-bit movd: An+Bn at byte offset 16*n from dst, An-Bn at byte
 * offset 16*(7-n) from dst.
 *
 * NOTE(review): presumably the branch logic earlier in idct() only reaches
 * label 1 when the src5 inputs are all zero -- confirm against that code.
 */
01142 #define IDCT(src0, src4, src1, src5, dst, shift) \
01143         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01144         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
01145         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
01146         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01147         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01148         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01149         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01150         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
01151         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
01152         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
01153         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
01154         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01155         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
01156         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
01157         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
01158         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
01159         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01160         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
01161         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
01162         "movq 64(%2), %%mm1             \n\t" /* -C7    C3      -C7     C3 (inferred from the product below) */\
01163         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
01164         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
01165         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
01166         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
01167         "psrad $" #shift ", %%mm7       \n\t"\
01168         "psrad $" #shift ", %%mm4       \n\t"\
01169         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
01170         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
01171         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
01172         "psrad $" #shift ", %%mm0       \n\t"\
01173         "psrad $" #shift ", %%mm3       \n\t"\
01174         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
01175         "movd %%mm7, " #dst "           \n\t"\
01176         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
01177         "movd %%mm0, 16+" #dst "        \n\t"\
01178         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
01179         "movd %%mm3, 96+" #dst "        \n\t"\
01180         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
01181         "movd %%mm4, 112+" #dst "       \n\t"\
01182         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
01183         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
01184         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
01185         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
01186         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
01187         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
01188         "psrad $" #shift ", %%mm3       \n\t"\
01189         "psrad $" #shift ", %%mm5       \n\t"\
01190         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
01191         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
01192         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
01193         "psrad $" #shift ", %%mm6       \n\t"\
01194         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
01195         "movd %%mm3, 32+" #dst "        \n\t"\
01196         "psrad $" #shift ", %%mm4       \n\t"\
01197         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
01198         "movd %%mm6, 48+" #dst "        \n\t"\
01199         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
01200         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
01201         "movd %%mm4, 64+" #dst "        \n\t"\
01202         "movd %%mm5, 80+" #dst "        \n\t"
01203 
01204 
01205 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01206 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01207 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01208 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01209 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01210         "jmp 9f                         \n\t"
01211 
01212 
01213         "#" ASMALIGN(4)
01214         "7:                             \n\t"
01215 #undef IDCT
/*
 * IDCT variant defined right after asm label 7.
 *
 * Reads only src0, at the given address (mm0) and 8 bytes further on (mm2);
 * src4, src1 and src5 are never referenced. Just two values are computed per
 * lane: A0 = C4*R4 + C4*R0 and A1 = -C4*R4 + C4*R0. Both are arithmetically
 * shifted right by `shift` and the two 64-bit groups are packed together with
 * signed saturation (packssdw).
 *
 * A0 is then stored with movq at dst, 48+dst, 64+dst and 112+dst; A1 at
 * 16+dst, 32+dst, 80+dst and 96+dst.
 *
 * NOTE(review): presumably the branch logic earlier in idct() only reaches
 * label 7 when every input except src0 is zero -- confirm against that code.
 */
01216 #define IDCT(src0, src4, src1, src5, dst, shift) \
01217         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
01218         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
01219         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01220         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
01221         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01222         "psrad $" #shift ", %%mm4       \n\t"\
01223         "psrad $" #shift ", %%mm0       \n\t"\
01224         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
01225         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
01226         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
01227         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
01228         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
01229         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2; NOTE(review): mm7 is not read again in this macro, so this load looks unused */\
01230         "psrad $" #shift ", %%mm1       \n\t"\
01231         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
01232         "movq %%mm4, " #dst "           \n\t"\
01233         "psrad $" #shift ", %%mm2       \n\t"\
01234         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
01235         "movq %%mm0, 16+" #dst "        \n\t"\
01236         "movq %%mm0, 96+" #dst "        \n\t"\
01237         "movq %%mm4, 112+" #dst "       \n\t"\
01238         "movq %%mm0, 32+" #dst "        \n\t"\
01239         "movq %%mm4, 48+" #dst "        \n\t"\
01240         "movq %%mm4, 64+" #dst "        \n\t"\
01241         "movq %%mm0, 80+" #dst "        \n\t"
01242 
01243 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
01244 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01245 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01246 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01247 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01248 
01249 
01250 #endif
01251 
01252 /*
01253 Input
01254  00 40 04 44 20 60 24 64
01255  10 30 14 34 50 70 54 74
01256  01 41 03 43 21 61 23 63
01257  11 31 13 33 51 71 53 73
01258  02 42 06 46 22 62 26 66
01259  12 32 16 36 52 72 56 76
01260  05 45 07 47 25 65 27 67
01261  15 35 17 37 55 75 57 77
01262 
01263 Temp
01264  00 04 10 14 20 24 30 34
01265  40 44 50 54 60 64 70 74
01266  01 03 11 13 21 23 31 33
01267  41 43 51 53 61 63 71 73
01268  02 06 12 16 22 26 32 36
01269  42 46 52 56 62 66 72 76
01270  05 07 15 17 25 27 35 37
01271  45 47 55 57 65 67 75 77
01272 */
01273 
01274 "9: \n\t"
01275                 :: "r" (block), "r" (temp), "r" (coeffs)
01276                 : "%eax"
01277         );
01278 }
01279 
/**
 * Public entry point that runs this file's idct() on a coefficient block.
 *
 * The closing part of idct() binds block as asm operand %0 and its final
 * passes store through %0, so the result overwrites the input in place.
 *
 * @param block coefficients to transform, overwritten with the result;
 *              presumably 64 int16_t values forming an 8x8 block -- TODO
 *              confirm against the start of idct()
 */
01280 void ff_simple_idct_mmx(int16_t *block)
01281 {
01282     idct(block);
01283 }
01284 
01285 //FIXME merge add/put into the idct
01286 
/**
 * Inverse-transform block with idct(), then hand the result to
 * put_pixels_clamped_mmx().
 *
 * @param dest      forwarded to put_pixels_clamped_mmx() as its destination;
 *                  presumably the output pixel buffer -- confirm in
 *                  dsputil_mmx.h
 * @param line_size forwarded unchanged; presumably the byte stride between
 *                  rows of dest -- confirm in dsputil_mmx.h
 * @param block     coefficients; transformed in place by idct() before being
 *                  read by put_pixels_clamped_mmx()
 */
01287 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01288 {
01289     idct(block);
01290     put_pixels_clamped_mmx(block, dest, line_size);
01291 }
/**
 * Inverse-transform block with idct(), then hand the result to
 * add_pixels_clamped_mmx().
 *
 * @param dest      forwarded to add_pixels_clamped_mmx() as its destination;
 *                  presumably existing pixels that the result is added onto
 *                  -- confirm in dsputil_mmx.h
 * @param line_size forwarded unchanged; presumably the byte stride between
 *                  rows of dest -- confirm in dsputil_mmx.h
 * @param block     coefficients; transformed in place by idct() before being
 *                  read by add_pixels_clamped_mmx()
 */
01292 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01293 {
01294     idct(block);
01295     add_pixels_clamped_mmx(block, dest, line_size);
01296 }

Generated on Fri Sep 16 2011 17:17:46 for FFmpeg by  doxygen 1.7.1