/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1) = 0x0103010301030103ULL;

/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmx2)
IDCT_ADD_FUNC(_dc, 10, mmx2)
IDCT_ADD_FUNC(8_dc, 8, mmx2)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#endif


#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const int *block_offset, \
     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#endif


#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t **dst, const int *block_offset, \
     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
#endif

void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
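
/*
 * Illustrative note (not part of the original source): the macros above only
 * declare prototypes; the function bodies are provided by the x86 assembly
 * sources and are wired up in ff_h264dsp_init_x86() below, guarded by
 * HAVE_YASM.  As an example of the token pasting, IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
 * expands to
 *
 *     void ff_h264_idct8_add4_10_sse2(uint8_t *dst, const int *block_offset,
 *                                     DCTELEM *block, int stride,
 *                                     const uint8_t nnzc[6*8]);
 *
 * i.e. the 10-bit SSE2 variant that adds up to four 8x8 IDCT blocks of a
 * macroblock, skipping blocks whose nonzero-coefficient count is zero.
 */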

/***********************************/
/* deblocking */

#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
    do { \
        x86_reg b_idx; \
        mask_mv <<= 3; \
        for( b_idx=0; b_idx<edges; b_idx+=step ) { \
            if (!mask_dir) \
                __asm__ volatile( \
                    "pxor %%mm0, %%mm0 \n\t" \
                    :: \
                ); \
            if(!(mask_mv & b_idx)) { \
                if(bidir) { \
                    __asm__ volatile( \
                        "movd      %a3(%0,%2), %%mm2 \n" \
                        "punpckldq %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \
                        "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \
                        "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \
                        "pshufw $0x4E, %%mm2, %%mm3 \n" \
                        "psubb  %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
                        "psubb  %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
                        \
                        "por    %%mm1, %%mm0 \n" \
                        "movq   %a5(%1,%2,4), %%mm1 \n" \
                        "movq   %a6(%1,%2,4), %%mm2 \n" \
                        "movq   %%mm1, %%mm3 \n" \
                        "movq   %%mm2, %%mm4 \n" \
                        "psubw   48(%1,%2,4), %%mm1 \n" \
                        "psubw   56(%1,%2,4), %%mm2 \n" \
                        "psubw  208(%1,%2,4), %%mm3 \n" \
                        "psubw  216(%1,%2,4), %%mm4 \n" \
                        "packsswb %%mm2, %%mm1 \n" \
                        "packsswb %%mm4, %%mm3 \n" \
                        "paddb    %%mm6, %%mm1 \n" \
                        "paddb    %%mm6, %%mm3 \n" \
                        "psubusb  %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb  %%mm5, %%mm3 \n" \
                        "packsswb %%mm3, %%mm1 \n" \
                        \
                        "por    %%mm1, %%mm0 \n" \
                        "movq   %a7(%1,%2,4), %%mm1 \n" \
                        "movq   %a8(%1,%2,4), %%mm2 \n" \
                        "movq   %%mm1, %%mm3 \n" \
                        "movq   %%mm2, %%mm4 \n" \
                        "psubw   48(%1,%2,4), %%mm1 \n" \
                        "psubw   56(%1,%2,4), %%mm2 \n" \
                        "psubw  208(%1,%2,4), %%mm3 \n" \
                        "psubw  216(%1,%2,4), %%mm4 \n" \
                        "packsswb %%mm2, %%mm1 \n" \
                        "packsswb %%mm4, %%mm3 \n" \
                        "paddb    %%mm6, %%mm1 \n" \
                        "paddb    %%mm6, %%mm3 \n" \
                        "psubusb  %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb  %%mm5, %%mm3 \n" \
                        "packsswb %%mm3, %%mm1 \n" \
                        \
                        "pshufw $0x4E, %%mm1, %%mm1 \n" \
                        "por    %%mm1, %%mm0 \n" \
                        "pshufw $0x4E, %%mm0, %%mm1 \n" \
                        "pminub %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx+52), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56), \
                          "i"(d_idx*4+208), \
                          "i"(d_idx*4+216) \
                    ); \
                } else { \
                    __asm__ volatile( \
                        "movd    12(%0,%2), %%mm0 \n" \
                        "psubb  %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
                        "movq    48(%1,%2,4), %%mm1 \n" \
                        "movq    56(%1,%2,4), %%mm2 \n" \
                        "psubw  %a4(%1,%2,4), %%mm1 \n" \
                        "psubw  %a5(%1,%2,4), %%mm2 \n" \
                        "packsswb %%mm2, %%mm1 \n" \
                        "paddb    %%mm6, %%mm1 \n" \
                        "psubusb  %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "packsswb %%mm1, %%mm1 \n" \
                        "por      %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56) \
                    ); \
                } \
            } \
            __asm__ volatile( \
                "movd 12(%0,%1), %%mm1 \n" \
                "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
                ::"r"(nnz), \
                  "r"(b_idx), \
                  "i"(d_idx+12) \
            ); \
            __asm__ volatile( \
                "pminub    %%mm7, %%mm1 \n" \
                "pminub    %%mm7, %%mm0 \n" \
                "psllw        $1, %%mm1 \n" \
                "pxor      %%mm2, %%mm2 \n" \
                "pmaxub    %%mm0, %%mm1 \n" \
                "punpcklbw %%mm2, %%mm1 \n" \
                "movq      %%mm1, %a1(%0,%2) \n" \
                ::"r"(bS), \
                  "i"(32*dir), \
                  "r"(b_idx) \
                :"memory" \
            ); \
        } \
    } while (0)
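
/*
 * Rough scalar equivalent of what the macro above computes per 4x4-block edge
 * (editor's sketch, not part of the original source; non-bidirectional frame
 * case only, field macroblocks use a smaller vertical MV threshold via
 * ff_pb_3_1):
 *
 *     int bs = 0;
 *     if (nnz[b] || nnz[bn])                       // either side has coefficients
 *         bs = 2;
 *     else if (ref[b] != ref[bn] ||                // different reference frames, or
 *              abs(mv[b][0] - mv[bn][0]) >= 4 ||   // MV difference of >= 1 pel
 *              abs(mv[b][1] - mv[bn][1]) >= 4)     // (4 quarter-pel units)
 *         bs = 1;
 *
 * The resulting strengths (0, 1 or 2) are stored into bS[dir][...]; the
 * stronger intra-edge cases are handled outside this helper.
 */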

static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    __asm__ volatile(
        "movq %0, %%mm7 \n"
        "movq %1, %%mm6 \n"
        ::"m"(ff_pb_1), "m"(ff_pb_3)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm6 \n"
            ::"m"(ff_pb_3_1)
        );
    __asm__ volatile(
        "movq %%mm6, %%mm5 \n"
        "paddb %%mm5, %%mm5 \n"
    :);

    // could do a special case for dir==0 && edges==1, but it only reduces the
    // average filter time by 1.2%
    step  <<= 3;
    edges <<= 3;
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv1, 1, -8,  0);
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir,    32,    8, mask_mv0, 0, -1, -1);

    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta);

#define LF_FUNCS(type, depth)\
LF_FUNC (h, chroma,       depth, mmxext)\
LF_IFUNC(h, chroma_intra, depth, mmxext)\
LF_FUNC (v, chroma,       depth, mmxext)\
LF_IFUNC(v, chroma_intra, depth, mmxext)\
LF_FUNC (h, luma,         depth, mmxext)\
LF_IFUNC(h, luma_intra,   depth, mmxext)\
LF_FUNC (h, luma,         depth, sse2)\
LF_IFUNC(h, luma_intra,   depth, sse2)\
LF_FUNC (v, luma,         depth, sse2)\
LF_IFUNC(v, luma_intra,   depth, sse2)\
LF_FUNC (h, chroma,       depth, sse2)\
LF_IFUNC(h, chroma_intra, depth, sse2)\
LF_FUNC (v, chroma,       depth, sse2)\
LF_IFUNC(v, chroma_intra, depth, sse2)\
LF_FUNC (h, luma,         depth, avx)\
LF_IFUNC(h, luma_intra,   depth, avx)\
LF_FUNC (v, luma,         depth, avx)\
LF_IFUNC(v, luma_intra,   depth, avx)\
LF_FUNC (h, chroma,       depth, avx)\
LF_IFUNC(h, chroma_intra, depth, avx)\
LF_FUNC (v, chroma,       depth, avx)\
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNCS( uint8_t,  8)
LF_FUNCS(uint16_t, 10)

#if ARCH_X86_32
LF_FUNC (v8, luma, 8, mmxext)
static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 */

LF_FUNC (v, luma,       10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */
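
/*
 * Note added for illustration (not part of the original source): the macros
 * below declare the explicit weighted-prediction primitives that the init
 * code maps onto weight_h264_pixels_tab[0..7] / biweight_h264_pixels_tab[0..7]
 * for the block sizes 16x16 down to 4x2.  For example, H264_WEIGHT(16, 16, sse2)
 * expands to
 *
 *     void ff_h264_weight_16x16_sse2(uint8_t *dst, int stride,
 *                                    int log2_denom, int weight, int offset);
 *
 * which, roughly, applies the H.264 unidirectional weighting to each pixel
 * (for log2_denom > 0):
 *
 *     dst[i] = av_clip_uint8(((dst[i] * weight +
 *                              (1 << (log2_denom - 1))) >> log2_denom) + offset);
 *
 * The biweight variants combine two predictions (dst and src) with separate
 * weights before rounding and offsetting.
 */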

#define H264_WEIGHT(W, H, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, H, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
    int weights, int offset);

#define H264_BIWEIGHT_MMX(W,H) \
H264_WEIGHT  (W, H, mmx2) \
H264_BIWEIGHT(W, H, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W,H) \
H264_BIWEIGHT_MMX(W, H) \
H264_WEIGHT      (W, H, sse2) \
H264_BIWEIGHT    (W, H, sse2) \
H264_BIWEIGHT    (W, H, ssse3)

H264_BIWEIGHT_MMX_SSE(16, 16)
H264_BIWEIGHT_MMX_SSE(16,  8)
H264_BIWEIGHT_MMX_SSE( 8, 16)
H264_BIWEIGHT_MMX_SSE( 8,  8)
H264_BIWEIGHT_MMX_SSE( 8,  4)
H264_BIWEIGHT_MMX    ( 4,  8)
H264_BIWEIGHT_MMX    ( 4,  4)
H264_BIWEIGHT_MMX    ( 4,  2)

void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
{
    int mm_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
        }
#if HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_MMX) {
            c->h264_idct_dc_add         =
            c->h264_idct_add            = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add        =
            c->h264_idct8_add           = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16          = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4          = ff_h264_idct8_add4_8_mmx;
            c->h264_idct_add8           = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_mmx;
            c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;

            if (mm_flags & AV_CPU_FLAG_MMX2) {
                c->h264_idct_dc_add    = ff_h264_idct_dc_add_8_mmx2;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_8_mmx2;
                c->h264_idct_add16     = ff_h264_idct_add16_8_mmx2;
                c->h264_idct8_add4     = ff_h264_idct8_add4_8_mmx2;
                c->h264_idct_add8      = ff_h264_idct_add8_8_mmx2;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;

                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
#if ARCH_X86_32
                c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
                c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

                if (mm_flags&AV_CPU_FLAG_SSE2) {
                    c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;

                    c->h264_idct_add16          = ff_h264_idct_add16_8_sse2;
                    c->h264_idct8_add4          = ff_h264_idct8_add4_8_sse2;
                    c->h264_idct_add8           = ff_h264_idct_add8_8_sse2;
                    c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;

                    c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                    c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                    c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                    c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                    c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;

                    c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                    c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                    c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                    c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                    c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif
                }
                if (mm_flags&AV_CPU_FLAG_SSSE3) {
                    c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                    c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                    c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                    c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                    c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
                }
                if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#endif
                }
            }
        }
#endif
    } else if (bit_depth == 10) {
#if HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_MMX) {
            if (mm_flags & AV_CPU_FLAG_MMX2) {
#if ARCH_X86_32
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
                c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
                c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif
                c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
                if (mm_flags&AV_CPU_FLAG_SSE2) {
                    c->h264_idct_add       = ff_h264_idct_add_10_sse2;
                    c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;

                    c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
                    c->h264_idct_add8      = ff_h264_idct_add8_10_sse2;
                    c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
                    c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;
                    c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
#endif

                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif
                }
#if HAVE_AVX
                if (mm_flags&AV_CPU_FLAG_AVX) {
                    c->h264_idct_dc_add    =
                    c->h264_idct_add       = ff_h264_idct_add_10_avx;
                    c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;

                    c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
                    c->h264_idct_add8      = ff_h264_idct_add8_10_avx;
                    c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
                    c->h264_idct8_add      = ff_h264_idct8_add_10_avx;
                    c->h264_idct8_add4     = ff_h264_idct8_add4_10_avx;
#endif

                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif
                }
#endif /* HAVE_AVX */
            }
        }
#endif
    }
}
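
/*
 * Usage sketch (illustration, not part of the original file): the generic
 * ff_h264dsp_init() first fills H264DSPContext with C fallbacks and then
 * calls this initializer on x86, so each pointer assigned above overrides the
 * C version only when the corresponding CPU flag is reported by
 * av_get_cpu_flags():
 *
 *     H264DSPContext c;
 *     ff_h264dsp_init(&c, 8);   // 8-bit decode path
 *     // on an SSE2-capable CPU, c.h264_idct8_add now points at
 *     // ff_h264_idct8_add_8_sse2 rather than the C implementation
 */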