#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
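/* Replicated byte/word constants used as SIMD operands by the code below. */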
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9  ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else

#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
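/* Byte-wise averages without pavgb:
 *   (a & b) + (((a ^ b) & 0xFE) >> 1)  =  (a + b) >> 1        (round down)
 *   (a | b) - (((a ^ b) & 0xFE) >> 1)  =  (a + b + 1) >> 1    (round up)
 * regfe must hold 0xFEFEFEFEFEFEFEFE; rega is preserved, regb is trashed.
 * The four-register PAVGBP variants do two averages at once and expect the
 * 0xFE mask in %%mm6. */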
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

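/* Instantiate the no-rounding put/avg pixel templates for plain MMX. */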
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

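/* Instantiate the rounding put/avg pixel templates for plain MMX. */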
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

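/* 3DNow! averaging primitives built on pavgusb. */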
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

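/* MMX2 averaging primitives built on pavgb. */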
#define DEF(x) x ## _mmx2

#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

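/* Store an 8x8 block of 16-bit DCT coefficients as pixels, clamping each
 * value to 0..255 via unsigned saturation (packuswb). */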
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    p = block;
    pix = pixels;

    __asm__ volatile(
        "movq %3, %%mm0 \n\t"
        "movq 8%3, %%mm1 \n\t"
        "movq 16%3, %%mm2 \n\t"
        "movq 24%3, %%mm3 \n\t"
        "movq 32%3, %%mm4 \n\t"
        "movq 40%3, %%mm5 \n\t"
        "movq 48%3, %%mm6 \n\t"
        "movq 56%3, %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    __asm__ volatile(
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
        :"memory");
}

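/* Same as above, but the coefficients are signed residuals: pack with signed
 * saturation, then bias by 128 (ff_pb_80) to map into the 0..255 range. */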
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t"\
    "movq 16+"#off"(%2), %%mm2 \n\t"\
    "movq 32+"#off"(%2), %%mm3 \n\t"\
    "movq 48+"#off"(%2), %%mm4 \n\t"\
    "packsswb 8+"#off"(%2), %%mm1 \n\t"\
    "packsswb 24+"#off"(%2), %%mm2 \n\t"\
    "packsswb 40+"#off"(%2), %%mm3 \n\t"\
    "packsswb 56+"#off"(%2), %%mm4 \n\t"\
    "paddb %%mm0, %%mm1 \n\t"\
    "paddb %%mm0, %%mm2 \n\t"\
    "paddb %%mm0, %%mm3 \n\t"\
    "paddb %%mm0, %%mm4 \n\t"\
    "movq %%mm1, (%0) \n\t"\
    "movq %%mm2, (%0, %3) \n\t"\
    "movq %%mm3, (%0, %3, 2) \n\t"\
    "movq %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
        :"memory");
}

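/* Add an 8x8 block of 16-bit DCT coefficients to the existing pixels,
 * clamping the result to 0..255. */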
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile(
            "movq (%2), %%mm0 \n\t"
            "movq 8(%2), %%mm1 \n\t"
            "movq 16(%2), %%mm2 \n\t"
            "movq 24(%2), %%mm3 \n\t"
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm6 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0 \n\t"
            "paddsw %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2 \n\t"
            "paddsw %%mm5, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %0 \n\t"
            "movq %%mm2, %1 \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2,%3,4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2,%3), %%xmm1 \n\t"
        "pavgb (%2,%3,2), %%xmm2 \n\t"
        "pavgb (%2,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2,%3,4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}

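/* Zero n consecutive 64-coefficient (8x8) DCT blocks of 128 bytes each. */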
#define CLEAR_BLOCKS(name,n) \
static void name(DCTELEM *blocks)\
{\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov %1, %%"REG_a" \n\t"\
        "1: \n\t"\
        "movq %%mm7, (%0, %%"REG_a") \n\t"\
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
        "add $32, %%"REG_a" \n\t"\
        " js 1b \n\t"\
        : : "r" (((uint8_t *)blocks)+128*n),\
            "i" (-128*n)\
        : "%"REG_a\
    );\
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile(
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
    );
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile(
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "1: \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        " js 1b \n"
        : : "r" (((uint8_t *)blocks)+128*6),
            "i" (-128*6)
        : "%"REG_a
    );
}

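/* Byte-wise dst[i] += src[i] over w bytes; the _l2 variant writes
 * dst[i] = src1[i] + src2[i]. The scalar tail loop handles the last bytes. */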
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    x86_reg i=0;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %3, %0 \n\t"
        " js 1b \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb (%3, %0), %%mm0 \n\t"
        "paddb 8(%3, %0), %%mm1 \n\t"
        "movq %%mm0, (%1, %0) \n\t"
        "movq %%mm1, 8(%1, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %4, %0 \n\t"
        " js 1b \n\t"
        : "+r" (i)
        : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i] = src1[i] + src2[i];
}

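/* HuffYUV median prediction using cmov:
 * dst[i] = diff[i] + median(left, top[i], left + top[i] - topleft). */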
#if HAVE_7REGS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
    x86_reg w2 = -w;
    x86_reg x;
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile(
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3,%4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6,%4), %b0 \n"
        "mov %b0, (%5,%4) \n"
        "inc %4 \n"
        "jl 1b \n"
        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
    );
    *left = l;
    *left_top = tl;
}
#endif
00641
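/* H.263 in-loop deblocking filter core: filters four 8-pixel rows held in
 * memory operands %0..%3, with 2*strength in %4 and ff_pb_FC in %5. */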
00642 #define H263_LOOP_FILTER \
00643 "pxor %%mm7, %%mm7 \n\t"\
00644 "movq %0, %%mm0 \n\t"\
00645 "movq %0, %%mm1 \n\t"\
00646 "movq %3, %%mm2 \n\t"\
00647 "movq %3, %%mm3 \n\t"\
00648 "punpcklbw %%mm7, %%mm0 \n\t"\
00649 "punpckhbw %%mm7, %%mm1 \n\t"\
00650 "punpcklbw %%mm7, %%mm2 \n\t"\
00651 "punpckhbw %%mm7, %%mm3 \n\t"\
00652 "psubw %%mm2, %%mm0 \n\t"\
00653 "psubw %%mm3, %%mm1 \n\t"\
00654 "movq %1, %%mm2 \n\t"\
00655 "movq %1, %%mm3 \n\t"\
00656 "movq %2, %%mm4 \n\t"\
00657 "movq %2, %%mm5 \n\t"\
00658 "punpcklbw %%mm7, %%mm2 \n\t"\
00659 "punpckhbw %%mm7, %%mm3 \n\t"\
00660 "punpcklbw %%mm7, %%mm4 \n\t"\
00661 "punpckhbw %%mm7, %%mm5 \n\t"\
00662 "psubw %%mm2, %%mm4 \n\t"\
00663 "psubw %%mm3, %%mm5 \n\t"\
00664 "psllw $2, %%mm4 \n\t"\
00665 "psllw $2, %%mm5 \n\t"\
00666 "paddw %%mm0, %%mm4 \n\t"\
00667 "paddw %%mm1, %%mm5 \n\t"\
00668 "pxor %%mm6, %%mm6 \n\t"\
00669 "pcmpgtw %%mm4, %%mm6 \n\t"\
00670 "pcmpgtw %%mm5, %%mm7 \n\t"\
00671 "pxor %%mm6, %%mm4 \n\t"\
00672 "pxor %%mm7, %%mm5 \n\t"\
00673 "psubw %%mm6, %%mm4 \n\t"\
00674 "psubw %%mm7, %%mm5 \n\t"\
00675 "psrlw $3, %%mm4 \n\t"\
00676 "psrlw $3, %%mm5 \n\t"\
00677 "packuswb %%mm5, %%mm4 \n\t"\
00678 "packsswb %%mm7, %%mm6 \n\t"\
00679 "pxor %%mm7, %%mm7 \n\t"\
00680 "movd %4, %%mm2 \n\t"\
00681 "punpcklbw %%mm2, %%mm2 \n\t"\
00682 "punpcklbw %%mm2, %%mm2 \n\t"\
00683 "punpcklbw %%mm2, %%mm2 \n\t"\
00684 "psubusb %%mm4, %%mm2 \n\t"\
00685 "movq %%mm2, %%mm3 \n\t"\
00686 "psubusb %%mm4, %%mm3 \n\t"\
00687 "psubb %%mm3, %%mm2 \n\t"\
00688 "movq %1, %%mm3 \n\t"\
00689 "movq %2, %%mm4 \n\t"\
00690 "pxor %%mm6, %%mm3 \n\t"\
00691 "pxor %%mm6, %%mm4 \n\t"\
00692 "paddusb %%mm2, %%mm3 \n\t"\
00693 "psubusb %%mm2, %%mm4 \n\t"\
00694 "pxor %%mm6, %%mm3 \n\t"\
00695 "pxor %%mm6, %%mm4 \n\t"\
00696 "paddusb %%mm2, %%mm2 \n\t"\
00697 "packsswb %%mm1, %%mm0 \n\t"\
00698 "pcmpgtb %%mm0, %%mm7 \n\t"\
00699 "pxor %%mm7, %%mm0 \n\t"\
00700 "psubb %%mm7, %%mm0 \n\t"\
00701 "movq %%mm0, %%mm1 \n\t"\
00702 "psubusb %%mm2, %%mm0 \n\t"\
00703 "psubb %%mm0, %%mm1 \n\t"\
00704 "pand %5, %%mm1 \n\t"\
00705 "psrlw $2, %%mm1 \n\t"\
00706 "pxor %%mm7, %%mm1 \n\t"\
00707 "psubb %%mm7, %%mm1 \n\t"\
00708 "movq %0, %%mm5 \n\t"\
00709 "movq %3, %%mm6 \n\t"\
00710 "psubb %%mm1, %%mm5 \n\t"\
00711 "paddb %%mm1, %%mm6 \n\t"
00712
00713 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00714 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00715 const int strength= ff_h263_loop_filter_strength[qscale];
00716
00717 __asm__ volatile(
00718
00719 H263_LOOP_FILTER
00720
00721 "movq %%mm3, %1 \n\t"
00722 "movq %%mm4, %2 \n\t"
00723 "movq %%mm5, %0 \n\t"
00724 "movq %%mm6, %3 \n\t"
00725 : "+m" (*(uint64_t*)(src - 2*stride)),
00726 "+m" (*(uint64_t*)(src - 1*stride)),
00727 "+m" (*(uint64_t*)(src + 0*stride)),
00728 "+m" (*(uint64_t*)(src + 1*stride))
00729 : "g" (2*strength), "m"(ff_pb_FC)
00730 );
00731 }
00732 }
00733
00734 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00735 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00736 const int strength= ff_h263_loop_filter_strength[qscale];
00737 DECLARE_ALIGNED(8, uint64_t, temp)[4];
00738 uint8_t *btemp= (uint8_t*)temp;
00739
00740 src -= 2;
00741
00742 transpose4x4(btemp , src , 8, stride);
00743 transpose4x4(btemp+4, src + 4*stride, 8, stride);
00744 __asm__ volatile(
00745 H263_LOOP_FILTER
00746
00747 : "+m" (temp[0]),
00748 "+m" (temp[1]),
00749 "+m" (temp[2]),
00750 "+m" (temp[3])
00751 : "g" (2*strength), "m"(ff_pb_FC)
00752 );
00753
00754 __asm__ volatile(
00755 "movq %%mm5, %%mm1 \n\t"
00756 "movq %%mm4, %%mm0 \n\t"
00757 "punpcklbw %%mm3, %%mm5 \n\t"
00758 "punpcklbw %%mm6, %%mm4 \n\t"
00759 "punpckhbw %%mm3, %%mm1 \n\t"
00760 "punpckhbw %%mm6, %%mm0 \n\t"
00761 "movq %%mm5, %%mm3 \n\t"
00762 "movq %%mm1, %%mm6 \n\t"
00763 "punpcklwd %%mm4, %%mm5 \n\t"
00764 "punpcklwd %%mm0, %%mm1 \n\t"
00765 "punpckhwd %%mm4, %%mm3 \n\t"
00766 "punpckhwd %%mm0, %%mm6 \n\t"
00767 "movd %%mm5, (%0) \n\t"
00768 "punpckhdq %%mm5, %%mm5 \n\t"
00769 "movd %%mm5, (%0,%2) \n\t"
00770 "movd %%mm3, (%0,%2,2) \n\t"
00771 "punpckhdq %%mm3, %%mm3 \n\t"
00772 "movd %%mm3, (%0,%3) \n\t"
00773 "movd %%mm1, (%1) \n\t"
00774 "punpckhdq %%mm1, %%mm1 \n\t"
00775 "movd %%mm1, (%1,%2) \n\t"
00776 "movd %%mm6, (%1,%2,2) \n\t"
00777 "punpckhdq %%mm6, %%mm6 \n\t"
00778 "movd %%mm6, (%1,%3) \n\t"
00779 :: "r" (src),
00780 "r" (src + 4*stride),
00781 "r" ((x86_reg) stride ),
00782 "r" ((x86_reg)(3*stride))
00783 );
00784 }
00785 }
00786
00787
00788
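/* Replicate the picture's border pixels into the surrounding edge area:
 * left/right edges always, top/bottom only when the corresponding EDGE_TOP /
 * EDGE_BOTTOM bit is set in sides. */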
00789 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
00790 {
00791 uint8_t *ptr, *last_line;
00792 int i;
00793
00794 last_line = buf + (height - 1) * wrap;
00795
00796 ptr = buf;
00797 if(w==8)
00798 {
00799 __asm__ volatile(
00800 "1: \n\t"
00801 "movd (%0), %%mm0 \n\t"
00802 "punpcklbw %%mm0, %%mm0 \n\t"
00803 "punpcklwd %%mm0, %%mm0 \n\t"
00804 "punpckldq %%mm0, %%mm0 \n\t"
00805 "movq %%mm0, -8(%0) \n\t"
00806 "movq -8(%0, %2), %%mm1 \n\t"
00807 "punpckhbw %%mm1, %%mm1 \n\t"
00808 "punpckhwd %%mm1, %%mm1 \n\t"
00809 "punpckhdq %%mm1, %%mm1 \n\t"
00810 "movq %%mm1, (%0, %2) \n\t"
00811 "add %1, %0 \n\t"
00812 "cmp %3, %0 \n\t"
00813 " jb 1b \n\t"
00814 : "+r" (ptr)
00815 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00816 );
00817 }
00818 else
00819 {
00820 __asm__ volatile(
00821 "1: \n\t"
00822 "movd (%0), %%mm0 \n\t"
00823 "punpcklbw %%mm0, %%mm0 \n\t"
00824 "punpcklwd %%mm0, %%mm0 \n\t"
00825 "punpckldq %%mm0, %%mm0 \n\t"
00826 "movq %%mm0, -8(%0) \n\t"
00827 "movq %%mm0, -16(%0) \n\t"
00828 "movq -8(%0, %2), %%mm1 \n\t"
00829 "punpckhbw %%mm1, %%mm1 \n\t"
00830 "punpckhwd %%mm1, %%mm1 \n\t"
00831 "punpckhdq %%mm1, %%mm1 \n\t"
00832 "movq %%mm1, (%0, %2) \n\t"
00833 "movq %%mm1, 8(%0, %2) \n\t"
00834 "add %1, %0 \n\t"
00835 "cmp %3, %0 \n\t"
00836 " jb 1b \n\t"
00837 : "+r" (ptr)
00838 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00839 );
00840 }
00841
00842
00843 if (sides&EDGE_TOP) {
00844 for(i = 0; i < h; i += 4) {
00845 ptr= buf - (i + 1) * wrap - w;
00846 __asm__ volatile(
00847 "1: \n\t"
00848 "movq (%1, %0), %%mm0 \n\t"
00849 "movq %%mm0, (%0) \n\t"
00850 "movq %%mm0, (%0, %2) \n\t"
00851 "movq %%mm0, (%0, %2, 2) \n\t"
00852 "movq %%mm0, (%0, %3) \n\t"
00853 "add $8, %0 \n\t"
00854 "cmp %4, %0 \n\t"
00855 " jb 1b \n\t"
00856 : "+r" (ptr)
00857 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
00858 );
00859 }
00860 }
00861
00862 if (sides&EDGE_BOTTOM) {
00863 for(i = 0; i < w; i += 4) {
00864 ptr= last_line + (i + 1) * wrap - w;
00865 __asm__ volatile(
00866 "1: \n\t"
00867 "movq (%1, %0), %%mm0 \n\t"
00868 "movq %%mm0, (%0) \n\t"
00869 "movq %%mm0, (%0, %2) \n\t"
00870 "movq %%mm0, (%0, %2, 2) \n\t"
00871 "movq %%mm0, (%0, %3) \n\t"
00872 "add $8, %0 \n\t"
00873 "cmp %4, %0 \n\t"
00874 " jb 1b \n\t"
00875 : "+r" (ptr)
00876 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
00877 );
00878 }
00879 }
00880 }
00881
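/* PNG Paeth prediction: each output byte is the source byte plus whichever
 * of left, top or top-left is closest to left + top - topleft. */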
00882 #define PAETH(cpu, abs3)\
00883 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
00884 {\
00885 x86_reg i = -bpp;\
00886 x86_reg end = w-3;\
00887 __asm__ volatile(\
00888 "pxor %%mm7, %%mm7 \n"\
00889 "movd (%1,%0), %%mm0 \n"\
00890 "movd (%2,%0), %%mm1 \n"\
00891 "punpcklbw %%mm7, %%mm0 \n"\
00892 "punpcklbw %%mm7, %%mm1 \n"\
00893 "add %4, %0 \n"\
00894 "1: \n"\
00895 "movq %%mm1, %%mm2 \n"\
00896 "movd (%2,%0), %%mm1 \n"\
00897 "movq %%mm2, %%mm3 \n"\
00898 "punpcklbw %%mm7, %%mm1 \n"\
00899 "movq %%mm2, %%mm4 \n"\
00900 "psubw %%mm1, %%mm3 \n"\
00901 "psubw %%mm0, %%mm4 \n"\
00902 "movq %%mm3, %%mm5 \n"\
00903 "paddw %%mm4, %%mm5 \n"\
00904 abs3\
00905 "movq %%mm4, %%mm6 \n"\
00906 "pminsw %%mm5, %%mm6 \n"\
00907 "pcmpgtw %%mm6, %%mm3 \n"\
00908 "pcmpgtw %%mm5, %%mm4 \n"\
00909 "movq %%mm4, %%mm6 \n"\
00910 "pand %%mm3, %%mm4 \n"\
00911 "pandn %%mm3, %%mm6 \n"\
00912 "pandn %%mm0, %%mm3 \n"\
00913 "movd (%3,%0), %%mm0 \n"\
00914 "pand %%mm1, %%mm6 \n"\
00915 "pand %%mm4, %%mm2 \n"\
00916 "punpcklbw %%mm7, %%mm0 \n"\
00917 "movq %6, %%mm5 \n"\
00918 "paddw %%mm6, %%mm0 \n"\
00919 "paddw %%mm2, %%mm3 \n"\
00920 "paddw %%mm3, %%mm0 \n"\
00921 "pand %%mm5, %%mm0 \n"\
00922 "movq %%mm0, %%mm3 \n"\
00923 "packuswb %%mm3, %%mm3 \n"\
00924 "movd %%mm3, (%1,%0) \n"\
00925 "add %4, %0 \n"\
00926 "cmp %5, %0 \n"\
00927 "jle 1b \n"\
00928 :"+r"(i)\
00929 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
00930 "m"(ff_pw_255)\
00931 :"memory"\
00932 );\
00933 }
00934
00935 #define ABS3_MMX2\
00936 "psubw %%mm5, %%mm7 \n"\
00937 "pmaxsw %%mm7, %%mm5 \n"\
00938 "pxor %%mm6, %%mm6 \n"\
00939 "pxor %%mm7, %%mm7 \n"\
00940 "psubw %%mm3, %%mm6 \n"\
00941 "psubw %%mm4, %%mm7 \n"\
00942 "pmaxsw %%mm6, %%mm3 \n"\
00943 "pmaxsw %%mm7, %%mm4 \n"\
00944 "pxor %%mm7, %%mm7 \n"
00945
00946 #define ABS3_SSSE3\
00947 "pabsw %%mm3, %%mm3 \n"\
00948 "pabsw %%mm4, %%mm4 \n"\
00949 "pabsw %%mm5, %%mm5 \n"
00950
00951 PAETH(mmx2, ABS3_MMX2)
00952 #if HAVE_SSSE3
00953 PAETH(ssse3, ABS3_SSSE3)
00954 #endif
00955
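/* MPEG-4 quarter-pel interpolation: 8-tap (-1, 3, -6, 20, 20, -6, 3, -1)
 * lowpass filter with >>5 normalization, applied horizontally and
 * vertically for 8- and 16-pixel wide blocks. */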
00956 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
00957 "paddw " #m4 ", " #m3 " \n\t" \
00958 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
00959 "pmullw " #m3 ", %%mm4 \n\t" \
00960 "movq "#in7", " #m3 " \n\t" \
00961 "movq "#in0", %%mm5 \n\t" \
00962 "paddw " #m3 ", %%mm5 \n\t" \
00963 "psubw %%mm5, %%mm4 \n\t" \
00964 "movq "#in1", %%mm5 \n\t" \
00965 "movq "#in2", %%mm6 \n\t" \
00966 "paddw " #m6 ", %%mm5 \n\t" \
00967 "paddw " #m5 ", %%mm6 \n\t" \
00968 "paddw %%mm6, %%mm6 \n\t" \
00969 "psubw %%mm6, %%mm5 \n\t" \
00970 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
00971 "paddw " #rnd ", %%mm4 \n\t" \
00972 "paddw %%mm4, %%mm5 \n\t" \
00973 "psraw $5, %%mm5 \n\t"\
00974 "packuswb %%mm5, %%mm5 \n\t"\
00975 OP(%%mm5, out, %%mm7, d)
00976
00977 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
00978 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00979 uint64_t temp;\
00980 \
00981 __asm__ volatile(\
00982 "pxor %%mm7, %%mm7 \n\t"\
00983 "1: \n\t"\
00984 "movq (%0), %%mm0 \n\t" \
00985 "movq %%mm0, %%mm1 \n\t" \
00986 "movq %%mm0, %%mm2 \n\t" \
00987 "punpcklbw %%mm7, %%mm0 \n\t" \
00988 "punpckhbw %%mm7, %%mm1 \n\t" \
00989 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
00990 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
00991 "movq %%mm2, %%mm3 \n\t" \
00992 "movq %%mm2, %%mm4 \n\t" \
00993 "psllq $8, %%mm2 \n\t" \
00994 "psllq $16, %%mm3 \n\t" \
00995 "psllq $24, %%mm4 \n\t" \
00996 "punpckhbw %%mm7, %%mm2 \n\t" \
00997 "punpckhbw %%mm7, %%mm3 \n\t" \
00998 "punpckhbw %%mm7, %%mm4 \n\t" \
00999 "paddw %%mm3, %%mm5 \n\t" \
01000 "paddw %%mm2, %%mm6 \n\t" \
01001 "paddw %%mm5, %%mm5 \n\t" \
01002 "psubw %%mm5, %%mm6 \n\t" \
01003 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01004 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01005 "paddw %%mm4, %%mm0 \n\t" \
01006 "paddw %%mm1, %%mm5 \n\t" \
01007 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01008 "psubw %%mm5, %%mm0 \n\t" \
01009 "paddw %6, %%mm6 \n\t"\
01010 "paddw %%mm6, %%mm0 \n\t" \
01011 "psraw $5, %%mm0 \n\t"\
01012 "movq %%mm0, %5 \n\t"\
01013 \
01014 \
01015 "movq 5(%0), %%mm0 \n\t" \
01016 "movq %%mm0, %%mm5 \n\t" \
01017 "movq %%mm0, %%mm6 \n\t" \
01018 "psrlq $8, %%mm0 \n\t" \
01019 "psrlq $16, %%mm5 \n\t" \
01020 "punpcklbw %%mm7, %%mm0 \n\t" \
01021 "punpcklbw %%mm7, %%mm5 \n\t" \
01022 "paddw %%mm0, %%mm2 \n\t" \
01023 "paddw %%mm5, %%mm3 \n\t" \
01024 "paddw %%mm2, %%mm2 \n\t" \
01025 "psubw %%mm2, %%mm3 \n\t" \
01026 "movq %%mm6, %%mm2 \n\t" \
01027 "psrlq $24, %%mm6 \n\t" \
01028 "punpcklbw %%mm7, %%mm2 \n\t" \
01029 "punpcklbw %%mm7, %%mm6 \n\t" \
01030 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01031 "paddw %%mm2, %%mm1 \n\t" \
01032 "paddw %%mm6, %%mm4 \n\t" \
01033 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01034 "psubw %%mm4, %%mm3 \n\t" \
01035 "paddw %6, %%mm1 \n\t"\
01036 "paddw %%mm1, %%mm3 \n\t" \
01037 "psraw $5, %%mm3 \n\t"\
01038 "movq %5, %%mm1 \n\t"\
01039 "packuswb %%mm3, %%mm1 \n\t"\
01040 OP_MMX2(%%mm1, (%1),%%mm4, q)\
01041 \
01042 \
01043 "movq 9(%0), %%mm1 \n\t" \
01044 "movq %%mm1, %%mm4 \n\t" \
01045 "movq %%mm1, %%mm3 \n\t" \
01046 "psrlq $8, %%mm1 \n\t" \
01047 "psrlq $16, %%mm4 \n\t" \
01048 "punpcklbw %%mm7, %%mm1 \n\t" \
01049 "punpcklbw %%mm7, %%mm4 \n\t" \
01050 "paddw %%mm1, %%mm5 \n\t" \
01051 "paddw %%mm4, %%mm0 \n\t" \
01052 "paddw %%mm5, %%mm5 \n\t" \
01053 "psubw %%mm5, %%mm0 \n\t" \
01054 "movq %%mm3, %%mm5 \n\t" \
01055 "psrlq $24, %%mm3 \n\t" \
01056 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
01057 "punpcklbw %%mm7, %%mm3 \n\t" \
01058 "paddw %%mm3, %%mm2 \n\t" \
01059 "psubw %%mm2, %%mm0 \n\t" \
01060 "movq %%mm5, %%mm2 \n\t" \
01061 "punpcklbw %%mm7, %%mm2 \n\t" \
01062 "punpckhbw %%mm7, %%mm5 \n\t" \
01063 "paddw %%mm2, %%mm6 \n\t" \
01064 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
01065 "paddw %6, %%mm0 \n\t"\
01066 "paddw %%mm6, %%mm0 \n\t" \
01067 "psraw $5, %%mm0 \n\t"\
01068 \
01069 \
01070 "paddw %%mm5, %%mm3 \n\t" \
01071 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01072 "paddw %%mm4, %%mm6 \n\t" \
01073 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
01074 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01075 "paddw %%mm1, %%mm4 \n\t" \
01076 "paddw %%mm2, %%mm5 \n\t" \
01077 "paddw %%mm6, %%mm6 \n\t" \
01078 "psubw %%mm6, %%mm4 \n\t" \
01079 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
01080 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
01081 "psubw %%mm5, %%mm3 \n\t" \
01082 "paddw %6, %%mm4 \n\t"\
01083 "paddw %%mm3, %%mm4 \n\t" \
01084 "psraw $5, %%mm4 \n\t"\
01085 "packuswb %%mm4, %%mm0 \n\t"\
01086 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
01087 \
01088 "add %3, %0 \n\t"\
01089 "add %4, %1 \n\t"\
01090 "decl %2 \n\t"\
01091 " jnz 1b \n\t"\
01092 : "+a"(src), "+c"(dst), "+D"(h)\
01093 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(temp), "m"(ROUNDER)\
01094 : "memory"\
01095 );\
01096 }\
01097 \
01098 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01099 int i;\
01100 int16_t temp[16];\
01101 \
01102 for(i=0; i<h; i++)\
01103 {\
01104 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01105 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01106 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01107 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01108 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01109 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01110 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01111 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01112 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01113 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01114 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01115 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01116 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
01117 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01118 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01119 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
01120 __asm__ volatile(\
01121 "movq (%0), %%mm0 \n\t"\
01122 "movq 8(%0), %%mm1 \n\t"\
01123 "paddw %2, %%mm0 \n\t"\
01124 "paddw %2, %%mm1 \n\t"\
01125 "psraw $5, %%mm0 \n\t"\
01126 "psraw $5, %%mm1 \n\t"\
01127 "packuswb %%mm1, %%mm0 \n\t"\
01128 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01129 "movq 16(%0), %%mm0 \n\t"\
01130 "movq 24(%0), %%mm1 \n\t"\
01131 "paddw %2, %%mm0 \n\t"\
01132 "paddw %2, %%mm1 \n\t"\
01133 "psraw $5, %%mm0 \n\t"\
01134 "psraw $5, %%mm1 \n\t"\
01135 "packuswb %%mm1, %%mm0 \n\t"\
01136 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
01137 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01138 : "memory"\
01139 );\
01140 dst+=dstStride;\
01141 src+=srcStride;\
01142 }\
01143 }\
01144 \
01145 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01146 __asm__ volatile(\
01147 "pxor %%mm7, %%mm7 \n\t"\
01148 "1: \n\t"\
01149 "movq (%0), %%mm0 \n\t" \
01150 "movq %%mm0, %%mm1 \n\t" \
01151 "movq %%mm0, %%mm2 \n\t" \
01152 "punpcklbw %%mm7, %%mm0 \n\t" \
01153 "punpckhbw %%mm7, %%mm1 \n\t" \
01154 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01155 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01156 "movq %%mm2, %%mm3 \n\t" \
01157 "movq %%mm2, %%mm4 \n\t" \
01158 "psllq $8, %%mm2 \n\t" \
01159 "psllq $16, %%mm3 \n\t" \
01160 "psllq $24, %%mm4 \n\t" \
01161 "punpckhbw %%mm7, %%mm2 \n\t" \
01162 "punpckhbw %%mm7, %%mm3 \n\t" \
01163 "punpckhbw %%mm7, %%mm4 \n\t" \
01164 "paddw %%mm3, %%mm5 \n\t" \
01165 "paddw %%mm2, %%mm6 \n\t" \
01166 "paddw %%mm5, %%mm5 \n\t" \
01167 "psubw %%mm5, %%mm6 \n\t" \
01168 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01169 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01170 "paddw %%mm4, %%mm0 \n\t" \
01171 "paddw %%mm1, %%mm5 \n\t" \
01172 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01173 "psubw %%mm5, %%mm0 \n\t" \
01174 "paddw %5, %%mm6 \n\t"\
01175 "paddw %%mm6, %%mm0 \n\t" \
01176 "psraw $5, %%mm0 \n\t"\
01177 \
01178 \
01179 "movd 5(%0), %%mm5 \n\t" \
01180 "punpcklbw %%mm7, %%mm5 \n\t" \
01181 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01182 "paddw %%mm5, %%mm1 \n\t" \
01183 "paddw %%mm6, %%mm2 \n\t" \
01184 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
01185 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01186 "paddw %%mm6, %%mm3 \n\t" \
01187 "paddw %%mm5, %%mm4 \n\t" \
01188 "paddw %%mm2, %%mm2 \n\t" \
01189 "psubw %%mm2, %%mm3 \n\t" \
01190 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01191 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01192 "psubw %%mm4, %%mm3 \n\t" \
01193 "paddw %5, %%mm1 \n\t"\
01194 "paddw %%mm1, %%mm3 \n\t" \
01195 "psraw $5, %%mm3 \n\t"\
01196 "packuswb %%mm3, %%mm0 \n\t"\
01197 OP_MMX2(%%mm0, (%1), %%mm4, q)\
01198 \
01199 "add %3, %0 \n\t"\
01200 "add %4, %1 \n\t"\
01201 "decl %2 \n\t"\
01202 " jnz 1b \n\t"\
01203 : "+a"(src), "+c"(dst), "+d"(h)\
01204 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ROUNDER)\
01205 : "memory"\
01206 );\
01207 }\
01208 \
01209 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01210 int i;\
01211 int16_t temp[8];\
01212 \
01213 for(i=0; i<h; i++)\
01214 {\
01215 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01216 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01217 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01218 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01219 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01220 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01221 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01222 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
01223 __asm__ volatile(\
01224 "movq (%0), %%mm0 \n\t"\
01225 "movq 8(%0), %%mm1 \n\t"\
01226 "paddw %2, %%mm0 \n\t"\
01227 "paddw %2, %%mm1 \n\t"\
01228 "psraw $5, %%mm0 \n\t"\
01229 "psraw $5, %%mm1 \n\t"\
01230 "packuswb %%mm1, %%mm0 \n\t"\
01231 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01232 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01233 :"memory"\
01234 );\
01235 dst+=dstStride;\
01236 src+=srcStride;\
01237 }\
01238 }
01239
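/* qpelN_mcXY dispatchers: each of the 16 quarter-pel positions is built from
 * the horizontal/vertical lowpass passes above plus half-pel averaging. */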
01240 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
01241 \
01242 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01243 uint64_t temp[17*4];\
01244 uint64_t *temp_ptr= temp;\
01245 int count= 17;\
01246 \
01247 \
01248 __asm__ volatile(\
01249 "pxor %%mm7, %%mm7 \n\t"\
01250 "1: \n\t"\
01251 "movq (%0), %%mm0 \n\t"\
01252 "movq (%0), %%mm1 \n\t"\
01253 "movq 8(%0), %%mm2 \n\t"\
01254 "movq 8(%0), %%mm3 \n\t"\
01255 "punpcklbw %%mm7, %%mm0 \n\t"\
01256 "punpckhbw %%mm7, %%mm1 \n\t"\
01257 "punpcklbw %%mm7, %%mm2 \n\t"\
01258 "punpckhbw %%mm7, %%mm3 \n\t"\
01259 "movq %%mm0, (%1) \n\t"\
01260 "movq %%mm1, 17*8(%1) \n\t"\
01261 "movq %%mm2, 2*17*8(%1) \n\t"\
01262 "movq %%mm3, 3*17*8(%1) \n\t"\
01263 "add $8, %1 \n\t"\
01264 "add %3, %0 \n\t"\
01265 "decl %2 \n\t"\
01266 " jnz 1b \n\t"\
01267 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01268 : "r" ((x86_reg)srcStride)\
01269 : "memory"\
01270 );\
01271 \
01272 temp_ptr= temp;\
01273 count=4;\
01274 \
01275 \
01276 __asm__ volatile(\
01277 \
01278 "1: \n\t"\
01279 "movq (%0), %%mm0 \n\t"\
01280 "movq 8(%0), %%mm1 \n\t"\
01281 "movq 16(%0), %%mm2 \n\t"\
01282 "movq 24(%0), %%mm3 \n\t"\
01283 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01284 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01285 "add %4, %1 \n\t"\
01286 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01287 \
01288 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01289 "add %4, %1 \n\t"\
01290 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01291 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
01292 "add %4, %1 \n\t"\
01293 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
01294 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
01295 "add %4, %1 \n\t"\
01296 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
01297 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
01298 "add %4, %1 \n\t"\
01299 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
01300 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
01301 "add %4, %1 \n\t"\
01302 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
01303 \
01304 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
01305 "add %4, %1 \n\t" \
01306 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
01307 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
01308 \
01309 "add $136, %0 \n\t"\
01310 "add %6, %1 \n\t"\
01311 "decl %2 \n\t"\
01312 " jnz 1b \n\t"\
01313 \
01314 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01315 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
01316 :"memory"\
01317 );\
01318 }\
01319 \
01320 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01321 uint64_t temp[9*2];\
01322 uint64_t *temp_ptr= temp;\
01323 int count= 9;\
01324 \
01325 \
01326 __asm__ volatile(\
01327 "pxor %%mm7, %%mm7 \n\t"\
01328 "1: \n\t"\
01329 "movq (%0), %%mm0 \n\t"\
01330 "movq (%0), %%mm1 \n\t"\
01331 "punpcklbw %%mm7, %%mm0 \n\t"\
01332 "punpckhbw %%mm7, %%mm1 \n\t"\
01333 "movq %%mm0, (%1) \n\t"\
01334 "movq %%mm1, 9*8(%1) \n\t"\
01335 "add $8, %1 \n\t"\
01336 "add %3, %0 \n\t"\
01337 "decl %2 \n\t"\
01338 " jnz 1b \n\t"\
01339 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01340 : "r" ((x86_reg)srcStride)\
01341 : "memory"\
01342 );\
01343 \
01344 temp_ptr= temp;\
01345 count=2;\
01346 \
01347 \
01348 __asm__ volatile(\
01349 \
01350 "1: \n\t"\
01351 "movq (%0), %%mm0 \n\t"\
01352 "movq 8(%0), %%mm1 \n\t"\
01353 "movq 16(%0), %%mm2 \n\t"\
01354 "movq 24(%0), %%mm3 \n\t"\
01355 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01356 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01357 "add %4, %1 \n\t"\
01358 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01359 \
01360 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01361 "add %4, %1 \n\t"\
01362 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01363 \
01364 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
01365 "add %4, %1 \n\t"\
01366 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
01367 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
01368 \
01369 "add $72, %0 \n\t"\
01370 "add %6, %1 \n\t"\
01371 "decl %2 \n\t"\
01372 " jnz 1b \n\t"\
01373 \
01374 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
01375 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
01376 : "memory"\
01377 );\
01378 }\
01379 \
01380 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01381 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
01382 }\
01383 \
01384 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01385 uint64_t temp[8];\
01386 uint8_t * const half= (uint8_t*)temp;\
01387 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01388 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01389 }\
01390 \
01391 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01392 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
01393 }\
01394 \
01395 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01396 uint64_t temp[8];\
01397 uint8_t * const half= (uint8_t*)temp;\
01398 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
01399 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
01400 }\
01401 \
01402 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01403 uint64_t temp[8];\
01404 uint8_t * const half= (uint8_t*)temp;\
01405 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01406 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
01407 }\
01408 \
01409 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01410 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
01411 }\
01412 \
01413 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01414 uint64_t temp[8];\
01415 uint8_t * const half= (uint8_t*)temp;\
01416 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
01417 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
01418 }\
01419 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01420 uint64_t half[8 + 9];\
01421 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01422 uint8_t * const halfHV= ((uint8_t*)half);\
01423 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01424 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01425 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01426 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01427 }\
01428 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01429 uint64_t half[8 + 9];\
01430 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01431 uint8_t * const halfHV= ((uint8_t*)half);\
01432 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01433 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01434 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01435 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01436 }\
01437 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01438 uint64_t half[8 + 9];\
01439 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01440 uint8_t * const halfHV= ((uint8_t*)half);\
01441 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01442 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01443 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01444 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01445 }\
01446 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01447 uint64_t half[8 + 9];\
01448 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01449 uint8_t * const halfHV= ((uint8_t*)half);\
01450 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01451 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01452 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01453 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01454 }\
01455 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01456 uint64_t half[8 + 9];\
01457 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01458 uint8_t * const halfHV= ((uint8_t*)half);\
01459 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01460 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01461 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
01462 }\
01463 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01464 uint64_t half[8 + 9];\
01465 uint8_t * const halfH= ((uint8_t*)half) + 64;\
01466 uint8_t * const halfHV= ((uint8_t*)half);\
01467 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01468 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
01469 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
01470 }\
01471 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01472 uint64_t half[8 + 9];\
01473 uint8_t * const halfH= ((uint8_t*)half);\
01474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01475 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
01476 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01477 }\
01478 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01479 uint64_t half[8 + 9];\
01480 uint8_t * const halfH= ((uint8_t*)half);\
01481 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01482 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
01483 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01484 }\
01485 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01486 uint64_t half[9];\
01487 uint8_t * const halfH= ((uint8_t*)half);\
01488 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
01489 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
01490 }\
01491 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01492 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
01493 }\
01494 \
01495 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01496 uint64_t temp[32];\
01497 uint8_t * const half= (uint8_t*)temp;\
01498 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01499 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01500 }\
01501 \
01502 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01503 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
01504 }\
01505 \
01506 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01507 uint64_t temp[32];\
01508 uint8_t * const half= (uint8_t*)temp;\
01509 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
01510 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
01511 }\
01512 \
01513 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01514 uint64_t temp[32];\
01515 uint8_t * const half= (uint8_t*)temp;\
01516 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01517 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
01518 }\
01519 \
01520 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01521 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
01522 }\
01523 \
01524 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01525 uint64_t temp[32];\
01526 uint8_t * const half= (uint8_t*)temp;\
01527 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
01528 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
01529 }\
01530 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01531 uint64_t half[16*2 + 17*2];\
01532 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01533 uint8_t * const halfHV= ((uint8_t*)half);\
01534 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01535 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01536 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01537 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01538 }\
01539 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01540 uint64_t half[16*2 + 17*2];\
01541 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01542 uint8_t * const halfHV= ((uint8_t*)half);\
01543 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01544 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01545 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01546 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01547 }\
01548 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01549 uint64_t half[16*2 + 17*2];\
01550 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01551 uint8_t * const halfHV= ((uint8_t*)half);\
01552 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01553 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01554 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01555 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01556 }\
01557 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01558 uint64_t half[16*2 + 17*2];\
01559 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01560 uint8_t * const halfHV= ((uint8_t*)half);\
01561 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01562 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01563 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01564 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01565 }\
01566 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01567 uint64_t half[16*2 + 17*2];\
01568 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01569 uint8_t * const halfHV= ((uint8_t*)half);\
01570 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01571 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01572 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
01573 }\
01574 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01575 uint64_t half[16*2 + 17*2];\
01576 uint8_t * const halfH= ((uint8_t*)half) + 256;\
01577 uint8_t * const halfHV= ((uint8_t*)half);\
01578 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01579 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
01580 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
01581 }\
01582 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01583 uint64_t half[17*2];\
01584 uint8_t * const halfH= ((uint8_t*)half);\
01585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01586 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
01587 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01588 }\
01589 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01590 uint64_t half[17*2];\
01591 uint8_t * const halfH= ((uint8_t*)half);\
01592 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01593 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
01594 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01595 }\
01596 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01597 uint64_t half[17*2];\
01598 uint8_t * const halfH= ((uint8_t*)half);\
01599 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
01600 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
01601 }
01602
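/* Store primitives for the qpel code below: PUT_OP writes the result straight to memory,
   while the AVG variants load the destination, average it with the result
   (pavgusb on 3DNow!, pavgb on MMX2) and store it back. */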
01603 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
01604 #define AVG_3DNOW_OP(a,b,temp, size) \
01605 "mov" #size " " #b ", " #temp " \n\t"\
01606 "pavgusb " #temp ", " #a " \n\t"\
01607 "mov" #size " " #a ", " #b " \n\t"
01608 #define AVG_MMX2_OP(a,b,temp, size) \
01609 "mov" #size " " #b ", " #temp " \n\t"\
01610 "pavgb " #temp ", " #a " \n\t"\
01611 "mov" #size " " #a ", " #b " \n\t"
01612
01613 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
01614 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
01615 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
01616 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
01617 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
01618 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
01619 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
01620 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
01621 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
01622
01623
01624
01625
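/* 2-tap quarter-pel approximations: each MC position is mapped onto the plain/half-pel
   pixel primitives or onto an _l3_ blend helper defined elsewhere, instead of running
   the full MPEG-4 lowpass filter. */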
01626 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
01627 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01628 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
01629 }
01630 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
01631 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01632 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
01633 }
01634
01635 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
01636 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
01637 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
01638 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
01639 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
01640 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
01641 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
01642 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
01643 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
01644 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
01645 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01646 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
01647 }\
01648 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01649 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
01650 }\
01651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
01652 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
01653 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
01654 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
01655 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
01656 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
01657 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
01658 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
01659
01660 QPEL_2TAP(put_, 16, mmx2)
01661 QPEL_2TAP(avg_, 16, mmx2)
01662 QPEL_2TAP(put_, 8, mmx2)
01663 QPEL_2TAP(avg_, 8, mmx2)
01664 QPEL_2TAP(put_, 16, 3dnow)
01665 QPEL_2TAP(avg_, 16, 3dnow)
01666 QPEL_2TAP(put_, 8, 3dnow)
01667 QPEL_2TAP(avg_, 8, 3dnow)
01668
01669
01670 #if HAVE_YASM
01671 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
01672 x86_reg linesize, x86_reg start_y,
01673 x86_reg end_y, x86_reg block_h,
01674 x86_reg start_x, x86_reg end_x,
01675 x86_reg block_w);
01676 extern emu_edge_core_func ff_emu_edge_core_mmx;
01677 extern emu_edge_core_func ff_emu_edge_core_sse;
01678
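/* Clamp the source coordinates so that at least one usable pixel remains inside the
   picture, then let the assembler core replicate the edge pixels into buf. */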
01679 static av_always_inline
01680 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
01681 int block_w, int block_h,
01682 int src_x, int src_y, int w, int h,
01683 emu_edge_core_func *core_fn)
01684 {
01685 int start_y, start_x, end_y, end_x, src_y_add=0;
01686
01687 if(src_y>= h){
01688 src_y_add = h-1-src_y;
01689 src_y=h-1;
01690 }else if(src_y<=-block_h){
01691 src_y_add = 1-block_h-src_y;
01692 src_y=1-block_h;
01693 }
01694 if(src_x>= w){
01695 src+= (w-1-src_x);
01696 src_x=w-1;
01697 }else if(src_x<=-block_w){
01698 src+= (1-block_w-src_x);
01699 src_x=1-block_w;
01700 }
01701
01702 start_y= FFMAX(0, -src_y);
01703 start_x= FFMAX(0, -src_x);
01704 end_y= FFMIN(block_h, h-src_y);
01705 end_x= FFMIN(block_w, w-src_x);
01706 assert(start_x < end_x && block_w > 0);
01707 assert(start_y < end_y && block_h > 0);
01708
01709
01710 src += (src_y_add+start_y)*linesize + start_x;
01711 buf += start_x;
01712 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
01713 }
01714
01715 #if ARCH_X86_32
01716 static av_noinline
01717 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
01718 int block_w, int block_h,
01719 int src_x, int src_y, int w, int h)
01720 {
01721 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01722 w, h, &ff_emu_edge_core_mmx);
01723 }
01724 #endif
01725 static av_noinline
01726 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
01727 int block_w, int block_h,
01728 int src_x, int src_y, int w, int h)
01729 {
01730 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01731 w, h, &ff_emu_edge_core_sse);
01732 }
01733 #endif
01734
01735 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
01736 int linesize, int block_w, int block_h,
01737 int src_x, int src_y, int w, int h);
01738
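/* Global motion compensation with bilinear sub-pel interpolation in MMX;
   emu_edge_fn is used when the transformed block reads outside the picture. */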
01739 static av_always_inline
01740 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01741 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
01742 emulated_edge_mc_func *emu_edge_fn)
01743 {
01744 const int w = 8;
01745 const int ix = ox>>(16+shift);
01746 const int iy = oy>>(16+shift);
01747 const int oxs = ox>>4;
01748 const int oys = oy>>4;
01749 const int dxxs = dxx>>4;
01750 const int dxys = dxy>>4;
01751 const int dyxs = dyx>>4;
01752 const int dyys = dyy>>4;
01753 const uint16_t r4[4] = {r,r,r,r};
01754 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
01755 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
01756 const uint64_t shift2 = 2*shift;
01757 uint8_t edge_buf[(h+1)*stride];
01758 int x, y;
01759
01760 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
01761 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
01762 const int dxh = dxy*(h-1);
01763 const int dyw = dyx*(w-1);
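/* Fall back to the C implementation when the full-pel offset varies across the block
   or the sub-pel increments are not multiples of 16. */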
01764 if(
01765 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
01766 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
01767
01768 || (dxx|dxy|dyx|dyy)&15 )
01769 {
01770
01771 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
01772 return;
01773 }
01774
01775 src += ix + iy*stride;
01776 if( (unsigned)ix >= width-w ||
01777 (unsigned)iy >= height-h )
01778 {
01779 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
01780 src = edge_buf;
01781 }
01782
01783 __asm__ volatile(
01784 "movd %0, %%mm6 \n\t"
01785 "pxor %%mm7, %%mm7 \n\t"
01786 "punpcklwd %%mm6, %%mm6 \n\t"
01787 "punpcklwd %%mm6, %%mm6 \n\t"
01788 :: "r"(1<<shift)
01789 );
01790
01791 for(x=0; x<w; x+=4){
01792 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
01793 oxs - dxys + dxxs*(x+1),
01794 oxs - dxys + dxxs*(x+2),
01795 oxs - dxys + dxxs*(x+3) };
01796 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
01797 oys - dyys + dyxs*(x+1),
01798 oys - dyys + dyxs*(x+2),
01799 oys - dyys + dyxs*(x+3) };
01800
01801 for(y=0; y<h; y++){
01802 __asm__ volatile(
01803 "movq %0, %%mm4 \n\t"
01804 "movq %1, %%mm5 \n\t"
01805 "paddw %2, %%mm4 \n\t"
01806 "paddw %3, %%mm5 \n\t"
01807 "movq %%mm4, %0 \n\t"
01808 "movq %%mm5, %1 \n\t"
01809 "psrlw $12, %%mm4 \n\t"
01810 "psrlw $12, %%mm5 \n\t"
01811 : "+m"(*dx4), "+m"(*dy4)
01812 : "m"(*dxy4), "m"(*dyy4)
01813 );
01814
01815 __asm__ volatile(
01816 "movq %%mm6, %%mm2 \n\t"
01817 "movq %%mm6, %%mm1 \n\t"
01818 "psubw %%mm4, %%mm2 \n\t"
01819 "psubw %%mm5, %%mm1 \n\t"
01820 "movq %%mm2, %%mm0 \n\t"
01821 "movq %%mm4, %%mm3 \n\t"
01822 "pmullw %%mm1, %%mm0 \n\t"
01823 "pmullw %%mm5, %%mm3 \n\t"
01824 "pmullw %%mm5, %%mm2 \n\t"
01825 "pmullw %%mm4, %%mm1 \n\t"
01826
01827 "movd %4, %%mm5 \n\t"
01828 "movd %3, %%mm4 \n\t"
01829 "punpcklbw %%mm7, %%mm5 \n\t"
01830 "punpcklbw %%mm7, %%mm4 \n\t"
01831 "pmullw %%mm5, %%mm3 \n\t"
01832 "pmullw %%mm4, %%mm2 \n\t"
01833
01834 "movd %2, %%mm5 \n\t"
01835 "movd %1, %%mm4 \n\t"
01836 "punpcklbw %%mm7, %%mm5 \n\t"
01837 "punpcklbw %%mm7, %%mm4 \n\t"
01838 "pmullw %%mm5, %%mm1 \n\t"
01839 "pmullw %%mm4, %%mm0 \n\t"
01840 "paddw %5, %%mm1 \n\t"
01841 "paddw %%mm3, %%mm2 \n\t"
01842 "paddw %%mm1, %%mm0 \n\t"
01843 "paddw %%mm2, %%mm0 \n\t"
01844
01845 "psrlw %6, %%mm0 \n\t"
01846 "packuswb %%mm0, %%mm0 \n\t"
01847 "movd %%mm0, %0 \n\t"
01848
01849 : "=m"(dst[x+y*stride])
01850 : "m"(src[0]), "m"(src[1]),
01851 "m"(src[stride]), "m"(src[stride+1]),
01852 "m"(*r4), "m"(shift2)
01853 );
01854 src += stride;
01855 }
01856 src += 4-h*stride;
01857 }
01858 }
01859
01860 #if HAVE_YASM
01861 #if ARCH_X86_32
01862 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01863 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01864 {
01865 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01866 width, height, &emulated_edge_mc_mmx);
01867 }
01868 #endif
01869 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01870 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01871 {
01872 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01873 width, height, &emulated_edge_mc_sse);
01874 }
01875 #else
01876 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
01877 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
01878 {
01879 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01880 width, height, &ff_emulated_edge_mc_8);
01881 }
01882 #endif
01883
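/* Issue one prefetch hint per row for h rows of the given block. */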
01884 #define PREFETCH(name, op) \
01885 static void name(void *mem, int stride, int h){\
01886 const uint8_t *p= mem;\
01887 do{\
01888 __asm__ volatile(#op" %0" :: "m"(*p));\
01889 p+= stride;\
01890 }while(--h);\
01891 }
01892 PREFETCH(prefetch_mmx2, prefetcht0)
01893 PREFETCH(prefetch_3dnow, prefetch)
01894 #undef PREFETCH
01895
01896 #include "h264_qpel_mmx.c"
01897
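/* Prototypes for the H.264 chroma MC routines implemented in external assembly (yasm). */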
01898 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
01899 int stride, int h, int x, int y);
01900 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
01901 int stride, int h, int x, int y);
01902 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
01903 int stride, int h, int x, int y);
01904
01905 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
01906 int stride, int h, int x, int y);
01907 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
01908 int stride, int h, int x, int y);
01909 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
01910 int stride, int h, int x, int y);
01911
01912 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01913 int stride, int h, int x, int y);
01914 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
01915 int stride, int h, int x, int y);
01916
01917 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01918 int stride, int h, int x, int y);
01919 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01920 int stride, int h, int x, int y);
01921
01922 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
01923 int stride, int h, int x, int y);
01924 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
01925 int stride, int h, int x, int y);
01926
01927 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
01928 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
01929 (uint8_t *dst, uint8_t *src,\
01930 int stride, int h, int x, int y);
01931
01932 CHROMA_MC(put, 2, 10, mmxext)
01933 CHROMA_MC(avg, 2, 10, mmxext)
01934 CHROMA_MC(put, 4, 10, mmxext)
01935 CHROMA_MC(avg, 4, 10, mmxext)
01936 CHROMA_MC(put, 8, 10, sse2)
01937 CHROMA_MC(avg, 8, 10, sse2)
01938 CHROMA_MC(put, 8, 10, avx)
01939 CHROMA_MC(avg, 8, 10, avx)
01940
01941
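/* CAVS wrappers: the full-pel (mc00) cases reuse the plain copy/average primitives. */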
01942 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
01943 put_pixels8_mmx(dst, src, stride, 8);
01944 }
01945 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
01946 avg_pixels8_mmx(dst, src, stride, 8);
01947 }
01948 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
01949 put_pixels16_mmx(dst, src, stride, 16);
01950 }
01951 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
01952 avg_pixels16_mmx(dst, src, stride, 16);
01953 }
01954
01955
01956 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
01957 put_pixels8_mmx(dst, src, stride, 8);
01958 }
01959 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
01960 avg_pixels8_mmx2(dst, src, stride, 8);
01961 }
01962
01963
01964
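/* IDCT wrappers: run the selected IDCT in place on the block, then clamp the result
   and either store it or add it to the destination. */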
01965 #if CONFIG_GPL
01966 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
01967 {
01968 ff_mmx_idct (block);
01969 ff_put_pixels_clamped_mmx(block, dest, line_size);
01970 }
01971 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
01972 {
01973 ff_mmx_idct (block);
01974 ff_add_pixels_clamped_mmx(block, dest, line_size);
01975 }
01976 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
01977 {
01978 ff_mmxext_idct (block);
01979 ff_put_pixels_clamped_mmx(block, dest, line_size);
01980 }
01981 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
01982 {
01983 ff_mmxext_idct (block);
01984 ff_add_pixels_clamped_mmx(block, dest, line_size);
01985 }
01986 #endif
01987 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
01988 {
01989 ff_idct_xvid_mmx (block);
01990 ff_put_pixels_clamped_mmx(block, dest, line_size);
01991 }
01992 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
01993 {
01994 ff_idct_xvid_mmx (block);
01995 ff_add_pixels_clamped_mmx(block, dest, line_size);
01996 }
01997 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
01998 {
01999 ff_idct_xvid_mmx2 (block);
02000 ff_put_pixels_clamped_mmx(block, dest, line_size);
02001 }
02002 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
02003 {
02004 ff_idct_xvid_mmx2 (block);
02005 ff_add_pixels_clamped_mmx(block, dest, line_size);
02006 }
02007
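/* Vorbis inverse channel coupling: converts (magnitude, angle) residue pairs back into
   the two channel spectra, handling the sign cases branchlessly. */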
02008 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02009 {
02010 int i;
02011 __asm__ volatile("pxor %%mm7, %%mm7":);
02012 for(i=0; i<blocksize; i+=2) {
02013 __asm__ volatile(
02014 "movq %0, %%mm0 \n\t"
02015 "movq %1, %%mm1 \n\t"
02016 "movq %%mm0, %%mm2 \n\t"
02017 "movq %%mm1, %%mm3 \n\t"
02018 "pfcmpge %%mm7, %%mm2 \n\t"
02019 "pfcmpge %%mm7, %%mm3 \n\t"
02020 "pslld $31, %%mm2 \n\t"
02021 "pxor %%mm2, %%mm1 \n\t"
02022 "movq %%mm3, %%mm4 \n\t"
02023 "pand %%mm1, %%mm3 \n\t"
02024 "pandn %%mm1, %%mm4 \n\t"
02025 "pfadd %%mm0, %%mm3 \n\t"
02026 "pfsub %%mm4, %%mm0 \n\t"
02027 "movq %%mm3, %1 \n\t"
02028 "movq %%mm0, %0 \n\t"
02029 :"+m"(mag[i]), "+m"(ang[i])
02030 ::"memory"
02031 );
02032 }
02033 __asm__ volatile("femms");
02034 }
02035 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02036 {
02037 int i;
02038
02039 __asm__ volatile(
02040 "movaps %0, %%xmm5 \n\t"
02041 ::"m"(ff_pdw_80000000[0])
02042 );
02043 for(i=0; i<blocksize; i+=4) {
02044 __asm__ volatile(
02045 "movaps %0, %%xmm0 \n\t"
02046 "movaps %1, %%xmm1 \n\t"
02047 "xorps %%xmm2, %%xmm2 \n\t"
02048 "xorps %%xmm3, %%xmm3 \n\t"
02049 "cmpleps %%xmm0, %%xmm2 \n\t"
02050 "cmpleps %%xmm1, %%xmm3 \n\t"
02051 "andps %%xmm5, %%xmm2 \n\t"
02052 "xorps %%xmm2, %%xmm1 \n\t"
02053 "movaps %%xmm3, %%xmm4 \n\t"
02054 "andps %%xmm1, %%xmm3 \n\t"
02055 "andnps %%xmm1, %%xmm4 \n\t"
02056 "addps %%xmm0, %%xmm3 \n\t"
02057 "subps %%xmm4, %%xmm0 \n\t"
02058 "movaps %%xmm3, %1 \n\t"
02059 "movaps %%xmm0, %0 \n\t"
02060 :"+m"(mag[i]), "+m"(ang[i])
02061 ::"memory"
02062 );
02063 }
02064 }
02065
02066 #define IF1(x) x
02067 #define IF0(x)
02068
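/* AC-3 downmix kernels: MIX5 is a specialised 5-channel to stereo/mono path,
   MIX_MISC handles arbitrary channel counts and coefficient matrices. */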
02069 #define MIX5(mono,stereo)\
02070 __asm__ volatile(\
02071 "movss 0(%2), %%xmm5 \n"\
02072 "movss 8(%2), %%xmm6 \n"\
02073 "movss 24(%2), %%xmm7 \n"\
02074 "shufps $0, %%xmm5, %%xmm5 \n"\
02075 "shufps $0, %%xmm6, %%xmm6 \n"\
02076 "shufps $0, %%xmm7, %%xmm7 \n"\
02077 "1: \n"\
02078 "movaps (%0,%1), %%xmm0 \n"\
02079 "movaps 0x400(%0,%1), %%xmm1 \n"\
02080 "movaps 0x800(%0,%1), %%xmm2 \n"\
02081 "movaps 0xc00(%0,%1), %%xmm3 \n"\
02082 "movaps 0x1000(%0,%1), %%xmm4 \n"\
02083 "mulps %%xmm5, %%xmm0 \n"\
02084 "mulps %%xmm6, %%xmm1 \n"\
02085 "mulps %%xmm5, %%xmm2 \n"\
02086 "mulps %%xmm7, %%xmm3 \n"\
02087 "mulps %%xmm7, %%xmm4 \n"\
02088 stereo("addps %%xmm1, %%xmm0 \n")\
02089 "addps %%xmm1, %%xmm2 \n"\
02090 "addps %%xmm3, %%xmm0 \n"\
02091 "addps %%xmm4, %%xmm2 \n"\
02092 mono("addps %%xmm2, %%xmm0 \n")\
02093 "movaps %%xmm0, (%0,%1) \n"\
02094 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
02095 "add $16, %0 \n"\
02096 "jl 1b \n"\
02097 :"+&r"(i)\
02098 :"r"(samples[0]+len), "r"(matrix)\
02099 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
02100 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
02101 "memory"\
02102 );
02103
02104 #define MIX_MISC(stereo)\
02105 __asm__ volatile(\
02106 "1: \n"\
02107 "movaps (%3,%0), %%xmm0 \n"\
02108 stereo("movaps %%xmm0, %%xmm1 \n")\
02109 "mulps %%xmm4, %%xmm0 \n"\
02110 stereo("mulps %%xmm5, %%xmm1 \n")\
02111 "lea 1024(%3,%0), %1 \n"\
02112 "mov %5, %2 \n"\
02113 "2: \n"\
02114 "movaps (%1), %%xmm2 \n"\
02115 stereo("movaps %%xmm2, %%xmm3 \n")\
02116 "mulps (%4,%2), %%xmm2 \n"\
02117 stereo("mulps 16(%4,%2), %%xmm3 \n")\
02118 "addps %%xmm2, %%xmm0 \n"\
02119 stereo("addps %%xmm3, %%xmm1 \n")\
02120 "add $1024, %1 \n"\
02121 "add $32, %2 \n"\
02122 "jl 2b \n"\
02123 "movaps %%xmm0, (%3,%0) \n"\
02124 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
02125 "add $16, %0 \n"\
02126 "jl 1b \n"\
02127 :"+&r"(i), "=&r"(j), "=&r"(k)\
02128 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
02129 :"memory"\
02130 );
02131
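/* Pick the fast 5-channel paths only when the coefficient matrix has the expected
   zero/symmetry pattern; otherwise expand the matrix and run the generic loop. */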
02132 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
02133 {
02134 int (*matrix_cmp)[2] = (int(*)[2])matrix;
02135 intptr_t i,j,k;
02136
02137 i = -len*sizeof(float);
02138 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
02139 MIX5(IF0,IF1);
02140 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
02141 MIX5(IF1,IF0);
02142 } else {
02143 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
02144 j = 2*in_ch*sizeof(float);
02145 __asm__ volatile(
02146 "1: \n"
02147 "sub $8, %0 \n"
02148 "movss (%2,%0), %%xmm4 \n"
02149 "movss 4(%2,%0), %%xmm5 \n"
02150 "shufps $0, %%xmm4, %%xmm4 \n"
02151 "shufps $0, %%xmm5, %%xmm5 \n"
02152 "movaps %%xmm4, (%1,%0,4) \n"
02153 "movaps %%xmm5, 16(%1,%0,4) \n"
02154 "jg 1b \n"
02155 :"+&r"(j)
02156 :"r"(matrix_simd), "r"(matrix)
02157 :"memory"
02158 );
02159 if(out_ch == 2) {
02160 MIX_MISC(IF1);
02161 } else {
02162 MIX_MISC(IF0);
02163 }
02164 }
02165 }
02166
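/* dst[i] = src0[i] * src1[i]; 4 floats per iteration with 3DNow!, 8 with SSE,
   walking the arrays from the end downwards. */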
02167 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
02168 x86_reg i = (len-4)*4;
02169 __asm__ volatile(
02170 "1: \n\t"
02171 "movq (%2,%0), %%mm0 \n\t"
02172 "movq 8(%2,%0), %%mm1 \n\t"
02173 "pfmul (%3,%0), %%mm0 \n\t"
02174 "pfmul 8(%3,%0), %%mm1 \n\t"
02175 "movq %%mm0, (%1,%0) \n\t"
02176 "movq %%mm1, 8(%1,%0) \n\t"
02177 "sub $16, %0 \n\t"
02178 "jge 1b \n\t"
02179 "femms \n\t"
02180 :"+r"(i)
02181 :"r"(dst), "r"(src0), "r"(src1)
02182 :"memory"
02183 );
02184 }
02185 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
02186 x86_reg i = (len-8)*4;
02187 __asm__ volatile(
02188 "1: \n\t"
02189 "movaps (%2,%0), %%xmm0 \n\t"
02190 "movaps 16(%2,%0), %%xmm1 \n\t"
02191 "mulps (%3,%0), %%xmm0 \n\t"
02192 "mulps 16(%3,%0), %%xmm1 \n\t"
02193 "movaps %%xmm0, (%1,%0) \n\t"
02194 "movaps %%xmm1, 16(%1,%0) \n\t"
02195 "sub $32, %0 \n\t"
02196 "jge 1b \n\t"
02197 :"+r"(i)
02198 :"r"(dst), "r"(src0), "r"(src1)
02199 :"memory"
02200 );
02201 }
02202
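/* dst[i] = src0[i] * src1[len-1-i]: src1 is read forwards and element-reversed
   (pswapd / shufps $0x1b) while dst and src0 are walked backwards. */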
02203 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
02204 x86_reg i = len*4-16;
02205 __asm__ volatile(
02206 "1: \n\t"
02207 "pswapd 8(%1), %%mm0 \n\t"
02208 "pswapd (%1), %%mm1 \n\t"
02209 "pfmul (%3,%0), %%mm0 \n\t"
02210 "pfmul 8(%3,%0), %%mm1 \n\t"
02211 "movq %%mm0, (%2,%0) \n\t"
02212 "movq %%mm1, 8(%2,%0) \n\t"
02213 "add $16, %1 \n\t"
02214 "sub $16, %0 \n\t"
02215 "jge 1b \n\t"
02216 :"+r"(i), "+r"(src1)
02217 :"r"(dst), "r"(src0)
02218 );
02219 __asm__ volatile("femms");
02220 }
02221 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
02222 x86_reg i = len*4-32;
02223 __asm__ volatile(
02224 "1: \n\t"
02225 "movaps 16(%1), %%xmm0 \n\t"
02226 "movaps (%1), %%xmm1 \n\t"
02227 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
02228 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
02229 "mulps (%3,%0), %%xmm0 \n\t"
02230 "mulps 16(%3,%0), %%xmm1 \n\t"
02231 "movaps %%xmm0, (%2,%0) \n\t"
02232 "movaps %%xmm1, 16(%2,%0) \n\t"
02233 "add $32, %1 \n\t"
02234 "sub $32, %0 \n\t"
02235 "jge 1b \n\t"
02236 :"+r"(i), "+r"(src1)
02237 :"r"(dst), "r"(src0)
02238 );
02239 }
02240
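/* dst[i] = src0[i] * src1[i] + src2[i], using the same backwards block loop as vector_fmul. */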
02241 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
02242 const float *src2, int len){
02243 x86_reg i = (len-4)*4;
02244 __asm__ volatile(
02245 "1: \n\t"
02246 "movq (%2,%0), %%mm0 \n\t"
02247 "movq 8(%2,%0), %%mm1 \n\t"
02248 "pfmul (%3,%0), %%mm0 \n\t"
02249 "pfmul 8(%3,%0), %%mm1 \n\t"
02250 "pfadd (%4,%0), %%mm0 \n\t"
02251 "pfadd 8(%4,%0), %%mm1 \n\t"
02252 "movq %%mm0, (%1,%0) \n\t"
02253 "movq %%mm1, 8(%1,%0) \n\t"
02254 "sub $16, %0 \n\t"
02255 "jge 1b \n\t"
02256 :"+r"(i)
02257 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02258 :"memory"
02259 );
02260 __asm__ volatile("femms");
02261 }
02262 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
02263 const float *src2, int len){
02264 x86_reg i = (len-8)*4;
02265 __asm__ volatile(
02266 "1: \n\t"
02267 "movaps (%2,%0), %%xmm0 \n\t"
02268 "movaps 16(%2,%0), %%xmm1 \n\t"
02269 "mulps (%3,%0), %%xmm0 \n\t"
02270 "mulps 16(%3,%0), %%xmm1 \n\t"
02271 "addps (%4,%0), %%xmm0 \n\t"
02272 "addps 16(%4,%0), %%xmm1 \n\t"
02273 "movaps %%xmm0, (%1,%0) \n\t"
02274 "movaps %%xmm1, 16(%1,%0) \n\t"
02275 "sub $32, %0 \n\t"
02276 "jge 1b \n\t"
02277 :"+r"(i)
02278 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
02279 :"memory"
02280 );
02281 }
02282
02283 #if HAVE_6REGS
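/* Windowed overlap-add as used by MDCT-based codecs: the buffers are processed from
   both ends at once, combining src0/src1 with the symmetric window and writing both
   halves of dst in a single pass. */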
02284 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
02285 const float *win, int len){
02286 x86_reg i = -len*4;
02287 x86_reg j = len*4-8;
02288 __asm__ volatile(
02289 "1: \n"
02290 "pswapd (%5,%1), %%mm1 \n"
02291 "movq (%5,%0), %%mm0 \n"
02292 "pswapd (%4,%1), %%mm5 \n"
02293 "movq (%3,%0), %%mm4 \n"
02294 "movq %%mm0, %%mm2 \n"
02295 "movq %%mm1, %%mm3 \n"
02296 "pfmul %%mm4, %%mm2 \n"
02297 "pfmul %%mm5, %%mm3 \n"
02298 "pfmul %%mm4, %%mm1 \n"
02299 "pfmul %%mm5, %%mm0 \n"
02300 "pfadd %%mm3, %%mm2 \n"
02301 "pfsub %%mm0, %%mm1 \n"
02302 "pswapd %%mm2, %%mm2 \n"
02303 "movq %%mm1, (%2,%0) \n"
02304 "movq %%mm2, (%2,%1) \n"
02305 "sub $8, %1 \n"
02306 "add $8, %0 \n"
02307 "jl 1b \n"
02308 "femms \n"
02309 :"+r"(i), "+r"(j)
02310 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
02311 );
02312 }
02313
02314 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
02315 const float *win, int len){
02316 x86_reg i = -len*4;
02317 x86_reg j = len*4-16;
02318 __asm__ volatile(
02319 "1: \n"
02320 "movaps (%5,%1), %%xmm1 \n"
02321 "movaps (%5,%0), %%xmm0 \n"
02322 "movaps (%4,%1), %%xmm5 \n"
02323 "movaps (%3,%0), %%xmm4 \n"
02324 "shufps $0x1b, %%xmm1, %%xmm1 \n"
02325 "shufps $0x1b, %%xmm5, %%xmm5 \n"
02326 "movaps %%xmm0, %%xmm2 \n"
02327 "movaps %%xmm1, %%xmm3 \n"
02328 "mulps %%xmm4, %%xmm2 \n"
02329 "mulps %%xmm5, %%xmm3 \n"
02330 "mulps %%xmm4, %%xmm1 \n"
02331 "mulps %%xmm5, %%xmm0 \n"
02332 "addps %%xmm3, %%xmm2 \n"
02333 "subps %%xmm0, %%xmm1 \n"
02334 "shufps $0x1b, %%xmm2, %%xmm2 \n"
02335 "movaps %%xmm1, (%2,%0) \n"
02336 "movaps %%xmm2, (%2,%1) \n"
02337 "sub $16, %1 \n"
02338 "add $16, %0 \n"
02339 "jl 1b \n"
02340 :"+r"(i), "+r"(j)
02341 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
02342 );
02343 }
02344 #endif
02345
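/* Clamp len floats to [min, max], 16 elements per iteration. */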
02346 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
02347 int len)
02348 {
02349 x86_reg i = (len-16)*4;
02350 __asm__ volatile(
02351 "movss %3, %%xmm4 \n"
02352 "movss %4, %%xmm5 \n"
02353 "shufps $0, %%xmm4, %%xmm4 \n"
02354 "shufps $0, %%xmm5, %%xmm5 \n"
02355 "1: \n\t"
02356 "movaps (%2,%0), %%xmm0 \n\t"
02357 "movaps 16(%2,%0), %%xmm1 \n\t"
02358 "movaps 32(%2,%0), %%xmm2 \n\t"
02359 "movaps 48(%2,%0), %%xmm3 \n\t"
02360 "maxps %%xmm4, %%xmm0 \n\t"
02361 "maxps %%xmm4, %%xmm1 \n\t"
02362 "maxps %%xmm4, %%xmm2 \n\t"
02363 "maxps %%xmm4, %%xmm3 \n\t"
02364 "minps %%xmm5, %%xmm0 \n\t"
02365 "minps %%xmm5, %%xmm1 \n\t"
02366 "minps %%xmm5, %%xmm2 \n\t"
02367 "minps %%xmm5, %%xmm3 \n\t"
02368 "movaps %%xmm0, (%1,%0) \n\t"
02369 "movaps %%xmm1, 16(%1,%0) \n\t"
02370 "movaps %%xmm2, 32(%1,%0) \n\t"
02371 "movaps %%xmm3, 48(%1,%0) \n\t"
02372 "sub $64, %0 \n\t"
02373 "jge 1b \n\t"
02374 :"+&r"(i)
02375 :"r"(dst), "r"(src), "m"(min), "m"(max)
02376 :"memory"
02377 );
02378 }
02379
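/* Prototypes for routines implemented in external assembly (yasm) that are wired up
   in dsputil_init_mmx() below. */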
02380 void ff_vp3_idct_mmx(int16_t *input_data);
02381 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02382 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
02383
02384 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
02385
02386 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02387 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
02388
02389 void ff_vp3_idct_sse2(int16_t *input_data);
02390 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02391 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
02392
02393 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
02394 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
02395 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02396 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02397 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
02398
02399 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
02400 const int16_t *window, unsigned int len);
02401 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
02402 const int16_t *window, unsigned int len);
02403 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
02404 const int16_t *window, unsigned int len);
02405 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
02406 const int16_t *window, unsigned int len);
02407 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
02408 const int16_t *window, unsigned int len);
02409 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
02410 const int16_t *window, unsigned int len);
02411
02412 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
02413 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
02414 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
02415
02416 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
02417
02418 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
02419 int32_t max, unsigned int len);
02420 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
02421 int32_t max, unsigned int len);
02422 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
02423 int32_t max, unsigned int len);
02424 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
02425 int32_t max, unsigned int len);
02426
02427 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
02428 const float *src1, int len);
02429 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
02430 const float *src1, int len);
02431
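/* Runtime dispatch: probe the CPU flags and install the fastest available
   implementation for each DSP function pointer. */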
02432 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
02433 {
02434 int mm_flags = av_get_cpu_flags();
02435 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02436 const int bit_depth = avctx->bits_per_raw_sample;
02437
02438 if (avctx->dsp_mask) {
02439 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
02440 mm_flags |= (avctx->dsp_mask & 0xffff);
02441 else
02442 mm_flags &= ~(avctx->dsp_mask & 0xffff);
02443 }
02444
02445 #if 0
02446 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
02447 if (mm_flags & AV_CPU_FLAG_MMX)
02448 av_log(avctx, AV_LOG_INFO, " mmx");
02449 if (mm_flags & AV_CPU_FLAG_MMX2)
02450 av_log(avctx, AV_LOG_INFO, " mmx2");
02451 if (mm_flags & AV_CPU_FLAG_3DNOW)
02452 av_log(avctx, AV_LOG_INFO, " 3dnow");
02453 if (mm_flags & AV_CPU_FLAG_SSE)
02454 av_log(avctx, AV_LOG_INFO, " sse");
02455 if (mm_flags & AV_CPU_FLAG_SSE2)
02456 av_log(avctx, AV_LOG_INFO, " sse2");
02457 av_log(avctx, AV_LOG_INFO, "\n");
02458 #endif
02459
02460 if (mm_flags & AV_CPU_FLAG_MMX) {
02461 const int idct_algo= avctx->idct_algo;
02462
02463 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
02464 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
02465 c->idct_put= ff_simple_idct_put_mmx;
02466 c->idct_add= ff_simple_idct_add_mmx;
02467 c->idct = ff_simple_idct_mmx;
02468 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
02469 #if CONFIG_GPL
02470 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
02471 if(mm_flags & AV_CPU_FLAG_MMX2){
02472 c->idct_put= ff_libmpeg2mmx2_idct_put;
02473 c->idct_add= ff_libmpeg2mmx2_idct_add;
02474 c->idct = ff_mmxext_idct;
02475 }else{
02476 c->idct_put= ff_libmpeg2mmx_idct_put;
02477 c->idct_add= ff_libmpeg2mmx_idct_add;
02478 c->idct = ff_mmx_idct;
02479 }
02480 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02481 #endif
02482 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
02483 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
02484 if(mm_flags & AV_CPU_FLAG_SSE2){
02485 c->idct_put= ff_vp3_idct_put_sse2;
02486 c->idct_add= ff_vp3_idct_add_sse2;
02487 c->idct = ff_vp3_idct_sse2;
02488 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02489 }else{
02490 c->idct_put= ff_vp3_idct_put_mmx;
02491 c->idct_add= ff_vp3_idct_add_mmx;
02492 c->idct = ff_vp3_idct_mmx;
02493 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
02494 }
02495 }else if(idct_algo==FF_IDCT_CAVS){
02496 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02497 }else if(idct_algo==FF_IDCT_XVIDMMX){
02498 if(mm_flags & AV_CPU_FLAG_SSE2){
02499 c->idct_put= ff_idct_xvid_sse2_put;
02500 c->idct_add= ff_idct_xvid_sse2_add;
02501 c->idct = ff_idct_xvid_sse2;
02502 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
02503 }else if(mm_flags & AV_CPU_FLAG_MMX2){
02504 c->idct_put= ff_idct_xvid_mmx2_put;
02505 c->idct_add= ff_idct_xvid_mmx2_add;
02506 c->idct = ff_idct_xvid_mmx2;
02507 }else{
02508 c->idct_put= ff_idct_xvid_mmx_put;
02509 c->idct_add= ff_idct_xvid_mmx_add;
02510 c->idct = ff_idct_xvid_mmx;
02511 }
02512 }
02513 }
02514
02515 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02516 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02517 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02518 if (!high_bit_depth) {
02519 c->clear_block = clear_block_mmx;
02520 c->clear_blocks = clear_blocks_mmx;
02521 if ((mm_flags & AV_CPU_FLAG_SSE) &&
02522 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
02523
02524 c->clear_block = clear_block_sse;
02525 c->clear_blocks = clear_blocks_sse;
02526 }
02527 }
02528
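/* Register the four half-pel variants (copy, x2, y2, xy2) of a put/avg family
   for one block size. */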
02529 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02530 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
02531 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
02532 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
02533 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
02534
02535 if (!high_bit_depth) {
02536 SET_HPEL_FUNCS(put, 0, 16, mmx);
02537 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02538 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02539 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02540 SET_HPEL_FUNCS(put, 1, 8, mmx);
02541 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02542 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02543 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02544 }
02545
02546 #if ARCH_X86_32 || !HAVE_YASM
02547 c->gmc= gmc_mmx;
02548 #endif
02549 #if ARCH_X86_32 && HAVE_YASM
02550 if (!high_bit_depth)
02551 c->emulated_edge_mc = emulated_edge_mc_mmx;
02552 #endif
02553
02554 c->add_bytes= add_bytes_mmx;
02555 c->add_bytes_l2= add_bytes_l2_mmx;
02556
02557 if (!high_bit_depth)
02558 c->draw_edges = draw_edges_mmx;
02559
02560 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02561 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
02562 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
02563 }
02564
02565 #if HAVE_YASM
02566 if (!high_bit_depth && CONFIG_H264CHROMA) {
02567 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
02568 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
02569 }
02570
02571 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
02572 #endif
02573
02574 if (mm_flags & AV_CPU_FLAG_MMX2) {
02575 c->prefetch = prefetch_mmx2;
02576
02577 if (!high_bit_depth) {
02578 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02579 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02580
02581 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02582 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02583 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02584
02585 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02586 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02587
02588 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02589 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02590 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02591 }
02592
02593 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02594 if (!high_bit_depth) {
02595 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02596 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02597 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02598 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02599 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02600 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02601 }
02602
02603 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02604 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
02605 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
02606 }
02607 }
02608 if (CONFIG_VP3_DECODER && HAVE_YASM) {
02609 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
02610 }
02611
02612 if (CONFIG_VP3_DECODER
02613 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02614 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
02615 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
02616 }
02617
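/* Register all 16 quarter-pel positions of a qpel family for one block size. */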
02618 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
02619 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
02620 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
02621 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
02622 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
02623 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
02624 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
02625 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
02626 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
02627 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
02628 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
02629 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
02630 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
02631 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
02632 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
02633 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
02634 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
02635
02636 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
02637 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
02638 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
02639 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
02640 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
02641 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
02642
02643 if (!high_bit_depth) {
02644 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
02645 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
02646 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
02647 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
02648 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
02649 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
02650 }
02651 else if (bit_depth == 10) {
02652 #if HAVE_YASM
02653 #if !ARCH_X86_64
02654 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
02655 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
02656 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
02657 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
02658 #endif
02659 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
02660 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
02661 #endif
02662 }
02663
02664 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
02665 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
02666 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
02667 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
02668
02669 #if HAVE_YASM
02670 if (!high_bit_depth && CONFIG_H264CHROMA) {
02671 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
02672 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
02673 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
02674 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
02675 }
02676 if (bit_depth == 10 && CONFIG_H264CHROMA) {
02677 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
02678 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
02679 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
02680 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
02681 }
02682
02683 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
02684 #endif
02685 #if HAVE_7REGS
02686 if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW) &&
02687 (mm_flags & AV_CPU_FLAG_CMOV))
02688 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
02689 #endif
02690
02691 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
02692 } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
02693 c->prefetch = prefetch_3dnow;
02694
02695 if (!high_bit_depth) {
02696 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02697 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02698
02699 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02700 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02701 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02702
02703 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02704 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02705
02706 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02707 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02708 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02709
02710 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02711 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02712 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02713 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02714 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02715 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02716 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02717 }
02718 }
02719
02720 if (CONFIG_VP3_DECODER
02721 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
02722 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02723 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02724 }
02725
02726 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
02727 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
02728 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
02729 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
02730 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
02731 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
02732
02733 if (!high_bit_depth) {
02734 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
02735 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
02736 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
02737 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
02738 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
02739 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
02740 }
02741
02742 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
02743 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
02744 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
02745 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
02746
02747 #if HAVE_YASM
02748 if (!high_bit_depth && CONFIG_H264CHROMA) {
02749 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
02750 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
02751 }
02752
02753 #endif
02754 }
02755
02756
02757 #define H264_QPEL_FUNCS(x, y, CPU)\
02758 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
02759 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
02760 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
02761 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
02762 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
02763
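/* Plain SSE2 copy/average paths; skipped when the CPU also reports 3DNow!. */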
02764 if (!high_bit_depth) {
02765 c->put_pixels_tab[0][0] = put_pixels16_sse2;
02766 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
02767 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
02768 H264_QPEL_FUNCS(0, 0, sse2);
02769 }
02770 }
02771 if(mm_flags & AV_CPU_FLAG_SSE2){
02772 if (!high_bit_depth) {
02773 H264_QPEL_FUNCS(0, 1, sse2);
02774 H264_QPEL_FUNCS(0, 2, sse2);
02775 H264_QPEL_FUNCS(0, 3, sse2);
02776 H264_QPEL_FUNCS(1, 1, sse2);
02777 H264_QPEL_FUNCS(1, 2, sse2);
02778 H264_QPEL_FUNCS(1, 3, sse2);
02779 H264_QPEL_FUNCS(2, 1, sse2);
02780 H264_QPEL_FUNCS(2, 2, sse2);
02781 H264_QPEL_FUNCS(2, 3, sse2);
02782 H264_QPEL_FUNCS(3, 1, sse2);
02783 H264_QPEL_FUNCS(3, 2, sse2);
02784 H264_QPEL_FUNCS(3, 3, sse2);
02785 }
02786 #if HAVE_YASM
02787 #define H264_QPEL_FUNCS_10(x, y, CPU)\
02788 c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
02789 c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
02790 c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
02791 c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
02792 if (bit_depth == 10) {
02793 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
02794 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
02795 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
02796 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
02797 H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
02798 H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
02799 H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
02800
02801 if (CONFIG_H264CHROMA) {
02802 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
02803 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
02804 }
02805 }
02806 #endif
02807 }
02808 #if HAVE_SSSE3
02809 if(mm_flags & AV_CPU_FLAG_SSSE3){
02810 if (!high_bit_depth) {
02811 H264_QPEL_FUNCS(1, 0, ssse3);
02812 H264_QPEL_FUNCS(1, 1, ssse3);
02813 H264_QPEL_FUNCS(1, 2, ssse3);
02814 H264_QPEL_FUNCS(1, 3, ssse3);
02815 H264_QPEL_FUNCS(2, 0, ssse3);
02816 H264_QPEL_FUNCS(2, 1, ssse3);
02817 H264_QPEL_FUNCS(2, 2, ssse3);
02818 H264_QPEL_FUNCS(2, 3, ssse3);
02819 H264_QPEL_FUNCS(3, 0, ssse3);
02820 H264_QPEL_FUNCS(3, 1, ssse3);
02821 H264_QPEL_FUNCS(3, 2, ssse3);
02822 H264_QPEL_FUNCS(3, 3, ssse3);
02823 }
02824 #if HAVE_YASM
02825 else if (bit_depth == 10) {
02826 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
02827 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
02828 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
02829 }
02830 #endif
02831 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
02832 #if HAVE_YASM
02833 if (!high_bit_depth && CONFIG_H264CHROMA) {
02834 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
02835 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
02836 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
02837 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
02838 }
02839 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02840 if (mm_flags & AV_CPU_FLAG_SSE4)
02841 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02842 #endif
02843 }
02844 #endif
02845
02846 if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
02847 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02848 c->vector_fmul = vector_fmul_3dnow;
02849 }
02850 if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
02851 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
02852 #if HAVE_6REGS
02853 c->vector_fmul_window = vector_fmul_window_3dnow2;
02854 #endif
02855 }
02856 if(mm_flags & AV_CPU_FLAG_MMX2){
02857 #if HAVE_YASM
02858 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
02859 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
02860 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02861 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
02862 } else {
02863 c->apply_window_int16 = ff_apply_window_int16_mmxext;
02864 }
02865 #endif
02866 }
02867 if(mm_flags & AV_CPU_FLAG_SSE){
02868 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02869 c->ac3_downmix = ac3_downmix_sse;
02870 c->vector_fmul = vector_fmul_sse;
02871 c->vector_fmul_reverse = vector_fmul_reverse_sse;
02872 c->vector_fmul_add = vector_fmul_add_sse;
02873 #if HAVE_6REGS
02874 c->vector_fmul_window = vector_fmul_window_sse;
02875 #endif
02876 c->vector_clipf = vector_clipf_sse;
02877 #if HAVE_YASM
02878 c->scalarproduct_float = ff_scalarproduct_float_sse;
02879 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
02880
02881 if (!high_bit_depth)
02882 c->emulated_edge_mc = emulated_edge_mc_sse;
02883 c->gmc = gmc_sse;
02884 #endif
02885 }
02886 if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
02887 c->vector_fmul_add = vector_fmul_add_3dnow;
02888 if(mm_flags & AV_CPU_FLAG_SSE2){
02889 #if HAVE_YASM
02890 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02891 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02892 if (mm_flags & AV_CPU_FLAG_ATOM) {
02893 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
02894 } else {
02895 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
02896 }
02897 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02898 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
02899 } else {
02900 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02901 c->apply_window_int16 = ff_apply_window_int16_sse2;
02902 }
02903 }
02904 #endif
02905 }
02906 if (mm_flags & AV_CPU_FLAG_SSSE3) {
02907 #if HAVE_YASM
02908 if (mm_flags & AV_CPU_FLAG_ATOM) {
02909 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
02910 } else {
02911 c->apply_window_int16 = ff_apply_window_int16_ssse3;
02912 }
02913 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) {
02914 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02915 }
02916 #endif
02917 }
02918
02919 if ((mm_flags & AV_CPU_FLAG_SSE4) && HAVE_SSE) {
02920 #if HAVE_YASM
02921 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
02922 #endif
02923 }
02924
02925 #if HAVE_AVX && HAVE_YASM
02926 if (mm_flags & AV_CPU_FLAG_AVX) {
02927 if (bit_depth == 10) {
02928
02929
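/* No dedicated AVX qpel routines here; the 10-bit SSE2 versions are reused. */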
02930 H264_QPEL_FUNCS_10(1, 0, sse2)
02931 H264_QPEL_FUNCS_10(2, 0, sse2)
02932 H264_QPEL_FUNCS_10(3, 0, sse2)
02933
02934 if (CONFIG_H264CHROMA) {
02935 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
02936 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
02937 }
02938 }
02939 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
02940 }
02941 #endif
02942 }
02943
02944 if (CONFIG_ENCODERS)
02945 dsputilenc_init_mmx(c, avctx);
02946 }