/*
 * Windows Media Audio Voice decoder.
 * Copyright (c) 2009 Ronald S. Bultje
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <math.h>
#include "avcodec.h"
#include "get_bits.h"
#include "put_bits.h"
#include "wmavoice_data.h"
#include "celp_math.h"
#include "celp_filters.h"
#include "acelp_vectors.h"
#include "acelp_filters.h"
#include "lsp.h"
#include "libavutil/lzo.h"
#include "avfft.h"
#include "fft.h"

#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
#define MAX_LSPS            16   ///< maximum filter order
#define MAX_LSPS_ALIGN16    16   ///< same as #MAX_LSPS; needs to be a multiple
                                 ///< of 16 (buffer alignment)

#define MAX_FRAMES           3   ///< maximum number of frames per superframe
#define MAX_FRAMESIZE      160   ///< maximum number of samples per frame
#define MAX_SIGNAL_HISTORY 416   ///< maximum excitation signal history
#define MAX_SFRAMESIZE     (MAX_FRAMESIZE * MAX_FRAMES)
                                 ///< maximum number of samples per superframe

#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
                                 ///< spills over from one packet into the next

#define VLC_NBITS            6   ///< number of bits to read per VLC iteration

static VLC frame_type_vlc;       ///< frame type VLC table

/**
 * Adaptive codebook types.
 */
enum {
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (hardcoded/silence only)
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch
    ACB_TYPE_HAMMING    = 2  ///< adaptive codebook with per-block pitch
};

/**
 * Fixed codebook types.
 */
enum {
    FCB_TYPE_SILENCE    = 0, ///< noise codebook for silence frames
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook
    FCB_TYPE_AW_PULSES  = 2, ///< pitch-adaptive window (AW) pulses
    FCB_TYPE_EXC_PULSES = 3  ///< innovation (excitation) pulses
};

/**
 * Description of frame types.
 */
static const struct frame_type_desc {
    uint8_t n_blocks;     ///< number of blocks per frame
    uint8_t log_n_blocks; ///< log2(n_blocks)
    uint8_t acb_type;     ///< adaptive codebook type (ACB_TYPE_*)
    uint8_t fcb_type;     ///< fixed codebook type (FCB_TYPE_*)
    uint8_t dbl_pulses;   ///< how many pulse vectors carry pulse pairs
    uint16_t frame_size;  ///< number of bits in the block data of this frame
} frame_descs[17] = {
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
};

typedef struct {
    GetBitContext gb;

    int8_t vbm_tree[25];
    int spillover_bitsize;
    int history_nsamples;

    /* postfilter specific values */
    int do_apf;
    int denoise_strength;
    int denoise_tilt_corr;
    int dc_level;

    int lsps;
    int lsp_q_mode;
    int lsp_def_mode;
    int frame_lsp_bitsize;
    int sframe_lsp_bitsize;

    int min_pitch_val;
    int max_pitch_val;
    int pitch_nbits;
    int block_pitch_nbits;
    int block_pitch_range;
    int block_delta_pitch_nbits;
    int block_delta_pitch_hrange;
    uint16_t block_conv_table[4];

    int spillover_nbits;
    int has_residual_lsps;
    int skip_bits_next;
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
    int sframe_cache_size;
    PutBitContext pb;

    double prev_lsps[MAX_LSPS];
    int last_pitch_val;
    int last_acb_type;
    int pitch_diff_sh16;
    float silence_gain;
    int aw_idx_is_ext;
    int aw_pulse_range;
    int aw_n_pulses[2];
    int aw_first_pulse_off[2];
    int aw_next_pulse_off_cache;

    int frame_cntr;
    float gain_pred_err[6];
    float excitation_history[MAX_SIGNAL_HISTORY];
    float synth_history[MAX_LSPS];

    RDFTContext rdft, irdft;
    DCTContext dct, dst;
    float sin[511], cos[511];
    float postfilter_agc;
    float dcf_mem[2];
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
    float denoise_filter_cache[MAX_FRAMESIZE];
    int denoise_filter_cache_size;
    DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
    DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
    DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
} WMAVoiceContext;

static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
{
    static const uint8_t bits[] = {
         2,  2,  2,  4,  4,  4,
         6,  6,  6,  8,  8,  8,
        10, 10, 10, 12, 12, 12,
        14, 14, 14, 14
    };
    static const uint16_t codes[] = {
        0x0000, 0x0001, 0x0002,        //              00/01/10
        0x000c, 0x000d, 0x000e,        //           11+00/01/10
        0x003c, 0x003d, 0x003e,        //         1111+00/01/10
        0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
        0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
        0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
        0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
    };
    int cntr[8], n, res;

    // vbm_tree decays to a pointer here, so spell out the full array size
    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
    memset(cntr,     0,    sizeof(cntr));
    for (n = 0; n < 17; n++) {
        res = get_bits(gb, 3);
        if (cntr[res] > 3) // should be >= 3 + (res == 7))
            return -1;
        vbm_tree[res * 3 + cntr[res]++] = n;
    }
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
                    bits, 1, 1, codes, 2, 2, 132);
    return 0;
}

static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
{
    int n, flags, pitch_range, lsp16_flag;
    WMAVoiceContext *s = ctx->priv_data;

    if (ctx->extradata_size != 46) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid extradata size %d (should be 46)\n",
               ctx->extradata_size);
        return -1;
    }
    flags                = AV_RL32(ctx->extradata + 18);
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
    s->do_apf            = flags & 0x1;
    if (s->do_apf) {
        ff_rdft_init(&s->rdft,  7, DFT_R2C);
        ff_rdft_init(&s->irdft, 7, IDFT_C2R);
        ff_dct_init(&s->dct, 6, DCT_I);
        ff_dct_init(&s->dst, 6, DST_I);

        ff_sine_window_init(s->cos, 256);
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
        for (n = 0; n < 255; n++) {
            s->sin[n]       = -s->sin[510 - n];
            s->cos[510 - n] =  s->cos[n];
        }
    }
    s->denoise_strength = (flags >> 2) & 0xF;
    if (s->denoise_strength >= 12) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid denoise filter strength %d (max=11)\n",
               s->denoise_strength);
        return -1;
    }
    s->denoise_tilt_corr = !!(flags & 0x40);
    s->dc_level          = (flags >> 7) & 0xF;
    s->lsp_q_mode        = !!(flags & 0x2000);
    s->lsp_def_mode      = !!(flags & 0x4000);
    lsp16_flag           = flags & 0x1000;
    if (lsp16_flag) {
        s->lsps               = 16;
        s->frame_lsp_bitsize  = 34;
        s->sframe_lsp_bitsize = 60;
    } else {
        s->lsps               = 10;
        s->frame_lsp_bitsize  = 24;
        s->sframe_lsp_bitsize = 48;
    }
    for (n = 0; n < s->lsps; n++)
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);

    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
        return -1;
    }

    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
    s->pitch_nbits      = av_ceil_log2(pitch_range);
    s->last_pitch_val   = 40;
    s->last_acb_type    = ACB_TYPE_NONE;
    s->history_nsamples = s->max_pitch_val + 8;

    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;

        av_log(ctx, AV_LOG_ERROR,
               "Unsupported samplerate %d (min=%d, max=%d)\n",
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz

        return -1;
    }

    s->block_conv_table[0]      = s->min_pitch_val;
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
    s->block_conv_table[3]      = s->max_pitch_val - 1;
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
    s->block_pitch_range        = s->block_conv_table[2] +
                                  s->block_conv_table[3] + 1 +
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);

    ctx->sample_fmt = SAMPLE_FMT_FLT;

    return 0;
}

static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }
    gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem    = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}

static int kalman_smoothen(WMAVoiceContext *s, int pitch,
                           const float *in, float *out, int size)
{
    int n;
    float optimal_gain = 0, dot;
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
                *best_hist_ptr;

    /* find best fitting point in history */
    do {
        dot = ff_dot_productf(in, ptr, size);
        if (dot > optimal_gain) {
            optimal_gain  = dot;
            best_hist_ptr = ptr;
        }
    } while (--ptr >= end);

    if (optimal_gain <= 0)
        return -1;
    dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
    if (dot <= 0) // would be 1.0
        return -1;

    if (optimal_gain <= dot) {
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
    } else
        dot = 0.625;

    /* actual smoothing */
    for (n = 0; n < size; n++)
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);

    return 0;
}

static float tilt_factor(const float *lpcs, int n_lpcs)
{
    float rh0, rh1;

    rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
    rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);

    return rh1 / rh0;
}

static void calc_input_response(WMAVoiceContext *s, float *lpcs,
                                int fcb_type, float *coeffs, int remainder)
{
    float last_coeff, min = 15.0, max = -15.0;
    float irange, angle_mul, gain_mul, range, sq;
    int n, idx;

    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
    ff_rdft_calc(&s->rdft, lpcs);
#define log_range(var, assign) do { \
        float tmp = log10f(assign);  var = tmp; \
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
    } while (0)
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
    for (n = 1; n < 64; n++)
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
#undef log_range
    range    = max - min;
    lpcs[64] = last_coeff;

    /* Now, use this spectrum to pick out these frequencies with higher
     * (relative) power/energy (which we then take to be "not noise"),
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
     * These frequencies will be maintained, while others ("noise") will be
     * decreased in the filter output. */
    irange   = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
    gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ?
                                                         (5.0 / 13.0) :
                                                         (5.0 / 14.7));
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
    for (n = 0; n <= 64; n++) {
        float pow;

        idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
        pow = wmavoice_denoise_power_table[s->denoise_strength][idx];
        lpcs[n] = angle_mul * pow;

        /* 70.57 =~ 1/log10(1.0331663) */
        idx = (pow * gain_mul - 0.0295) * 70.570526123;
        if (idx > 127) { // fallback if index falls outside table range
            coeffs[n] = wmavoice_energy_table[127] *
                        powf(1.0331663, idx - 127);
        } else
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
    }

    /* calculate the Hilbert transform of the gains, which we do (since this
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
     * "moment" of the LPCs in this filter. */
    ff_dct_calc(&s->dct, lpcs);
    ff_dct_calc(&s->dst, lpcs);

    /* Split out the coefficient indexes into phase/magnitude pairs */
    idx = 255 + av_clip(lpcs[64],                -255, 255);
    coeffs[0]  = coeffs[0]  * s->cos[idx];
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
    last_coeff = coeffs[64] * s->cos[idx];
    for (n = 63;; n--) {
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];

        if (!--n) break;

        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
    }
    coeffs[1] = last_coeff;

    /* move into real domain */
    ff_rdft_calc(&s->irdft, coeffs);

    /* tilt correction and normalize scale */
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
    if (s->denoise_tilt_corr) {
        float tilt_mem = 0;

        coeffs[remainder - 1] = 0;
        ff_tilt_compensation(&tilt_mem,
                             -1.8 * tilt_factor(coeffs, remainder - 1),
                             coeffs, remainder);
    }
    sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
    for (n = 0; n < remainder; n++)
        coeffs[n] *= sq;
}

static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
                           float *synth_pf, int size,
                           const float *lpcs)
{
    int remainder, lim, n;

    if (fcb_type != FCB_TYPE_SILENCE) {
        float *tilted_lpcs = s->tilted_lpcs_pf,
              *coeffs      = s->denoise_coeffs_pf, tilt_mem = 0;

        tilted_lpcs[0] = 1.0;
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
        memset(&tilted_lpcs[s->lsps + 1], 0,
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
                             tilted_lpcs, s->lsps + 2);

        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
         * size is applied to the next frame. All input beyond this is zero,
         * and thus all output beyond this will go towards zero, hence we can
         * limit to min(size-1, 127-size) as a performance consideration. */
        remainder = FFMIN(127 - size, size - 1);
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);

        /* apply coefficients (in frequency spectrum domain), i.e.
         * complex number multiplication */
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
        ff_rdft_calc(&s->rdft, synth_pf);
        ff_rdft_calc(&s->rdft, coeffs);
        synth_pf[0] *= coeffs[0];
        synth_pf[1] *= coeffs[1];
        for (n = 1; n < 64; n++) {
            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
        }
        ff_rdft_calc(&s->irdft, synth_pf);
    }

    /* merge filter output with the history of previous runs */
    if (s->denoise_filter_cache_size) {
        lim = FFMIN(s->denoise_filter_cache_size, size);
        for (n = 0; n < lim; n++)
            synth_pf[n] += s->denoise_filter_cache[n];
        s->denoise_filter_cache_size -= lim;
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
    }

    /* move remainder of filter output into a cache for future runs */
    if (fcb_type != FCB_TYPE_SILENCE) {
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
        for (n = 0; n < lim; n++)
            s->denoise_filter_cache[n] += synth_pf[size + n];
        if (lim < remainder) {
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
            s->denoise_filter_cache_size = remainder;
        }
    }
}

static void postfilter(WMAVoiceContext *s, const float *synth,
                       float *samples, int size,
                       const float *lpcs, float *zero_exc_pf,
                       int fcb_type, int pitch)
{
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
          *synth_filter_in = zero_exc_pf;

    assert(size <= MAX_FRAMESIZE / 2);

    /* generate excitation from input signal */
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);

    if (fcb_type >= FCB_TYPE_AW_PULSES &&
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
        synth_filter_in = synth_filter_in_buf;

    /* re-synthesize speech after smoothing, and keep history */
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
                                 synth_filter_in, size, s->lsps);
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
           sizeof(synth_pf[0]) * s->lsps);

    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);

    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
                          &s->postfilter_agc);

    if (s->dc_level > 8) {
        /* remove ultra-low frequency DC noise / highpass filter;
         * coefficients are identical to those used in SIPR decoding,
         * and very closely resemble those used in AMR-NB decoding.
         */
        ff_acelp_apply_order_2_transfer_function(samples, samples,
            (const float[2]) { -1.99997,      1.0 },
            (const float[2]) { -1.9330735188, 0.93589198496 },
            0.93980580475, s->dcf_mem, size);
    }
}

static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    int n, m;

    memset(lsps, 0, num * sizeof(*lsps));
    for (n = 0; n < n_stages; n++) {
        const uint8_t *t_off = &table[values[n] * num];
        double base = base_q[n], mul = mul_q[n];

        for (m = 0; m < num; m++)
            lsps[m] += base + mul * t_off[m];

        table += sizes[n] * num;
    }
}

static void dequant_lsp10i(GetBitContext *gb, double *lsps)
{
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
    static const double mul_lsf[4] = {
        5.2187144800e-3,    1.4626986422e-3,
        9.6179549166e-4,    1.1325736225e-3
    };
    static const double base_lsf[4] = {
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
    };
    uint16_t v[4];

    v[0] = get_bits(gb, 8);
    v[1] = get_bits(gb, 6);
    v[2] = get_bits(gb, 5);
    v[3] = get_bits(gb, 5);

    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
                 mul_lsf, base_lsf);
}

static void dequant_lsp10r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
    static const double mul_lsf[3] = {
        2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
    };
    static const double base_lsf[3] = {
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
    };
    const float (*ipol_tab)[2][10] = q_mode ?
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
    uint16_t interpol, v[3];
    int n;

    dequant_lsp10i(gb, i_lsps);

    interpol = get_bits(gb, 5);
    v[0]     = get_bits(gb, 7);
    v[1]     = get_bits(gb, 6);
    v[2]     = get_bits(gb, 6);

    for (n = 0; n < 10; n++) {
        double delta = old[n] - i_lsps[n];
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
    }

    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
                 mul_lsf, base_lsf);
}

static void dequant_lsp16i(GetBitContext *gb, double *lsps)
{
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
    static const double mul_lsf[5] = {
        3.3439586280e-3, 6.9908173703e-4,
        3.3216608306e-3, 1.0334960326e-3,
        3.1899104283e-3
    };
    static const double base_lsf[5] = {
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
        M_PI * -1.29816e-1
    };
    uint16_t v[5];

    v[0] = get_bits(gb, 8);
    v[1] = get_bits(gb, 6);
    v[2] = get_bits(gb, 7);
    v[3] = get_bits(gb, 6);
    v[4] = get_bits(gb, 7);

    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
}

static void dequant_lsp16r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
    static const double mul_lsf[3] = {
        1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
    };
    static const double base_lsf[3] = {
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
    };
    const float (*ipol_tab)[2][16] = q_mode ?
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
    uint16_t interpol, v[3];
    int n;

    dequant_lsp16i(gb, i_lsps);

    interpol = get_bits(gb, 5);
    v[0]     = get_bits(gb, 7);
    v[1]     = get_bits(gb, 7);
    v[2]     = get_bits(gb, 7);

    for (n = 0; n < 16; n++) {
        double delta = old[n] - i_lsps[n];
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
    }

    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
}

static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
                            const int *pitch)
{
    static const int16_t start_offset[94] = {
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
    };
    int bits, offset;

    /* position of pulse */
    s->aw_idx_is_ext = 0;
    if ((bits = get_bits(gb, 6)) >= 54) {
        s->aw_idx_is_ext = 1;
        bits += (bits - 54) * 3 + get_bits(gb, 2);
    }

    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
     * the distribution of the pulses in each block contained in this frame. */
    s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
    offset                  += s->aw_n_pulses[0] * pitch[0];
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;

    /* if continuing from a position before the block, reset position to
     * start of block (when corrected for the range over which it can be
     * spread in aw_pulse_set1()). */
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
            s->aw_first_pulse_off[1] -= pitch[1];
        if (start_offset[bits] < 0)
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
                s->aw_first_pulse_off[0] -= pitch[0];
    }
}

static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
{
    uint16_t use_mask[7]; // only 5 are used, rest is padding
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
     * of idx are the position of the bit within a particular item in the
     * array (0 being the most significant bit, and 15 being the least
     * significant bit), and the remainder (>> 4) is the index in the
     * use_mask[]-array. This is faster and uses less memory than using an
     * 80-byte/80-int array.
     */
    int pulse_off = s->aw_first_pulse_off[block_idx],
        pulse_start, n, idx, range, aidx, start_off = 0;

    /* set offset of first pulse to within this block */
    if (s->aw_n_pulses[block_idx] > 0)
        while (pulse_off + s->aw_pulse_range < 1)
            pulse_off += fcb->pitch_lag;

    /* find range per pulse */
    if (s->aw_n_pulses[0] > 0) {
        if (block_idx == 0) {
            range = 32;
        } else /* block_idx = 1 */ {
            range = 8;
            if (s->aw_n_pulses[block_idx] > 0)
                pulse_off = s->aw_next_pulse_off_cache;
        }
    } else
        range = 16;
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;

    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
     * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
     * we exclude that range from being pulsed again in this function. */
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
    if (s->aw_n_pulses[block_idx] > 0)
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
            int excl_range         = s->aw_pulse_range; // always 16 or 24
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
            int first_sh           = 16 - (idx & 15);
            *use_mask_ptr++       &= 0xFFFF << first_sh;
            excl_range            -= first_sh;
            if (excl_range >= 16) {
                *use_mask_ptr++    = 0;
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
            } else
                *use_mask_ptr     &= 0xFFFF >> excl_range;
        }

    /* find the 'aidx'th offset that is not excluded */
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
    for (n = 0; n <= aidx; pulse_start++) {
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
            if (use_mask[0])      idx = 0x0F;
            else if (use_mask[1]) idx = 0x1F;
            else if (use_mask[2]) idx = 0x2F;
            else if (use_mask[3]) idx = 0x3F;
            else if (use_mask[4]) idx = 0x4F;
            else                  return;
            idx -= av_log2_16bit(use_mask[idx >> 4]);
        }
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
            n++;
            start_off = idx;
        }
    }

    fcb->x[fcb->n] = start_off;
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
    fcb->n++;

    /* set offset for next block, relative to start of that block */
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
}

static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
{
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
    float v;

    if (s->aw_n_pulses[block_idx] > 0) {
        int n, v_mask, i_mask, sh, n_pulses;

        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
            n_pulses = 3;
            v_mask   = 8;
            i_mask   = 7;
            sh       = 4;
        } else { // 4 pulses, 1:sign + 2:index each
            n_pulses = 4;
            v_mask   = 4;
            i_mask   = 3;
            sh       = 3;
        }

        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
            fcb->y[fcb->n] = (val & v_mask) ?
                             -1.0 : 1.0;
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
                             s->aw_first_pulse_off[block_idx];
            while (fcb->x[fcb->n] < 0)
                fcb->x[fcb->n] += fcb->pitch_lag;
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
                fcb->n++;
        }
    } else {
        int num2 = (val & 0x1FF) >> 1, delta, idx;

        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
        v = (val & 0x200) ? -1.0 : 1.0;

        fcb->no_repeat_mask |= 3 << fcb->n;
        fcb->x[fcb->n]       = idx - delta;
        fcb->y[fcb->n]       = v;
        fcb->x[fcb->n + 1]   = idx;
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
        fcb->n              += 2;
    }
}

static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each col represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
                                    // so this is effectively a modulo (%)
    y = x - 9 * MULH(477218589, x); // x % 9
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
                                    // z = x * 49995 / (y * 5 + 6)
    return z % (1000 - block_size);
}

static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
                                  int block_idx, int size,
                                  const struct frame_type_desc *frame_desc,
                                  float *excitation)
{
    float gain;
    int n, r_idx;

    assert(size <= MAX_FRAMESIZE);

    /* Set the offset from which we start reading wmavoice_std_codebook */
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
        r_idx = pRNG(s->frame_cntr, block_idx, size);
        gain  = s->silence_gain;
    } else /* FCB_TYPE_HARDCODED */ {
        r_idx = get_bits(gb, 8);
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
    }

    /* Clear gain prediction parameters */
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));

    /* Apply gain to hardcoded codebook and use that as excitation signal */
    for (n = 0; n < size; n++)
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
}

static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
                                int block_idx, int size,
                                int block_pitch_sh2,
                                const struct frame_type_desc *frame_desc,
                                float *excitation)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
    int n, idx, gain_weight;
    AMRFixed fcb;

    assert(size <= MAX_FRAMESIZE / 2);
    memset(pulses, 0, sizeof(*pulses) * size);

    fcb.pitch_lag      = block_pitch_sh2 >> 2;
    fcb.pitch_fac      = 1.0;
    fcb.no_repeat_mask = 0;
    fcb.n              = 0;

    /* For the other frame types, this is where we apply the innovation
     * (fixed) codebook pulses of the speech signal. */
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
        aw_pulse_set1(s, gb, block_idx, &fcb);
        aw_pulse_set2(s, gb, block_idx, &fcb);
    } else /* FCB_TYPE_EXC_PULSES */ {
        int offset_nbits = 5 - frame_desc->log_n_blocks;

        fcb.no_repeat_mask = -1;
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
         * (instead of double) for a subset of pulses */
        for (n = 0; n < 5; n++) {
            float sign;
            int pos1, pos2;

            sign           = get_bits1(gb) ? 1.0 : -1.0;
            pos1           = get_bits(gb, offset_nbits);
            fcb.x[fcb.n]   = n + 5 * pos1;
            fcb.y[fcb.n++] = sign;
            if (n < frame_desc->dbl_pulses) {
                pos2           = get_bits(gb, offset_nbits);
                fcb.x[fcb.n]   = n + 5 * pos2;
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
            }
        }
    }
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);

    /* Calculate gain for adaptive & fixed codebook signal.
     * see ff_amr_set_fixed_gain(). */
    idx = get_bits(gb, 7);
    fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
    acb_gain = wmavoice_gain_codebook_acb[idx];
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
                        -2.9957322736 /* log(0.05) */,
                         1.6094379124 /* log(5.0) */);

    gain_weight = 8 >> frame_desc->log_n_blocks;
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
    for (n = 0; n < gain_weight; n++)
        s->gain_pred_err[n] = pred_err;

    /* Calculation of adaptive codebook */
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
        int len;
        for (n = 0; n < size; n += len) {
            int next_idx_sh16;
            int abs_idx    = block_idx * size + n;
            int pitch_sh16 = (s->last_pitch_val << 16) +
                             s->pitch_diff_sh16 * abs_idx;
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
            idx            = idx_sh16 >> 16;
            if (s->pitch_diff_sh16) {
                if (s->pitch_diff_sh16 > 0) {
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
                } else
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
                              1, size - n);
            } else
                len = size;

            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
                                  wmavoice_ipol1_coeffs, 17,
                                  idx, 9, len);
        }
    } else /* ACB_TYPE_HAMMING */ {
        int block_pitch = block_pitch_sh2 >> 2;
        idx             = block_pitch_sh2 & 3;
        if (idx) {
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
                                  wmavoice_ipol2_coeffs, 4,
                                  idx, 8, size);
        } else
            av_memcpy_backptr(excitation, sizeof(float) * block_pitch,
                              sizeof(float) * size);
    }

    /* Interpolate ACB/FCB and use as excitation signal */
    ff_weighted_vector_sumf(excitation, excitation, pulses,
                            acb_gain, fcb_gain, size);
}

static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
                        int block_idx, int size,
                        int block_pitch_sh2,
                        const double *lsps, const double *prev_lsps,
                        const struct frame_type_desc *frame_desc,
                        float *excitation, float *synth)
{
    double i_lsps[MAX_LSPS];
    float lpcs[MAX_LSPS];
    float fac;
    int n;

    if (frame_desc->acb_type == ACB_TYPE_NONE)
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
    else
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
                            frame_desc, excitation);

    /* convert interpolated LSPs to LPCs */
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);

    /* Speech synthesis */
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
}

static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                       float *samples,
                       const double *lsps, const double *prev_lsps,
                       float *excitation, float *synth)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
    int pitch[MAX_BLOCKS], last_block_pitch;

    /* Parse frame type ("frame header"), see frame_descs */
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
        block_nsamples;

    if (bd_idx < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid frame type VLC code, skipping\n");
        return -1;
    }
    // only index frame_descs[] once bd_idx has been validated
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;

    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
        /* Pitch is provided per frame, which is interpreted as the pitch of
         * the last sample of the last block of this frame. We can interpolate
         * the pitch of other blocks (and even pitch-per-sample) by gradually
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val.
         */
        n_blocks_x2     = frame_descs[bd_idx].n_blocks << 1;
        log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
        cur_pitch_val   = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
        cur_pitch_val   = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
        if (s->last_acb_type == ACB_TYPE_NONE ||
            20 * abs(cur_pitch_val - s->last_pitch_val) >
                (cur_pitch_val + s->last_pitch_val))
            s->last_pitch_val = cur_pitch_val;

        /* pitch per block */
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
            int fac = n * 2 + 1;

            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
        }

        /* "pitch-diff-per-sample" for calculation of pitch per sample */
        s->pitch_diff_sh16 =
            ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
    }

    /* Global gain (if silence) and pitch-adaptive window coordinates */
    switch (frame_descs[bd_idx].fcb_type) {
    case FCB_TYPE_SILENCE:
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
        break;
    case FCB_TYPE_AW_PULSES:
        aw_parse_coords(s, gb, pitch);
        break;
    }

    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
        int bl_pitch_sh2;

        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
        switch (frame_descs[bd_idx].acb_type) {
        case ACB_TYPE_HAMMING: {
            /* Pitch is given per block. Per-block pitches are encoded as an
             * absolute value for the first block, and then delta values
             * (relative to this value) for all subsequent blocks. The scale of
             * this pitch value is semi-logarithmic compared to its use in the
             * decoder, so we convert it back to the normal scale as well. */
            int block_pitch,
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;

            if (n == 0) {
                block_pitch = get_bits(gb, s->block_pitch_nbits);
            } else
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
                              get_bits(gb, s->block_delta_pitch_nbits);
            /* Convert last_ so that any next delta is within _range */
            last_block_pitch = av_clip(block_pitch,
                                       s->block_delta_pitch_hrange,
                                       s->block_pitch_range -
                                           s->block_delta_pitch_hrange);

            /* Convert semi-log-style scale back to normal scale */
            if (block_pitch < t1) {
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
            } else {
                block_pitch -= t1;
                if (block_pitch < t2) {
                    bl_pitch_sh2 =
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
                } else {
                    block_pitch -= t2;
                    if (block_pitch < t3) {
                        bl_pitch_sh2 =
                            (s->block_conv_table[2] + block_pitch) << 2;
                    } else
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
                }
            }
            pitch[n] = bl_pitch_sh2 >> 2;
            break;
        }

        case ACB_TYPE_ASYMMETRIC: {
            bl_pitch_sh2 = pitch[n] << 2;
            break;
        }

        default: // ACB_TYPE_NONE has no pitch
            bl_pitch_sh2 = 0;
            break;
        }

        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
                    lsps, prev_lsps, &frame_descs[bd_idx],
                    &excitation[n * block_nsamples],
                    &synth[n * block_nsamples]);
    }

    /* Averaging projection filter, if applicable.
     * Else, just copy samples from synthesis buffer */
    if (s->do_apf) {
        double i_lsps[MAX_LSPS];
        float lpcs[MAX_LSPS];

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, synth, samples, 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
                   frame_descs[bd_idx].fcb_type, pitch[0]);

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(lsps[n]);
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
                   frame_descs[bd_idx].fcb_type, pitch[0]);
    } else
        memcpy(samples, synth, 160 * sizeof(synth[0]));

    /* Cache values for next frame */
    s->frame_cntr++;
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
    s->last_acb_type = frame_descs[bd_idx].acb_type;
    switch (frame_descs[bd_idx].acb_type) {
    case ACB_TYPE_NONE:
        s->last_pitch_val = 0;
        break;
    case ACB_TYPE_ASYMMETRIC:
        s->last_pitch_val = cur_pitch_val;
        break;
    case ACB_TYPE_HAMMING:
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
        break;
    }

    return 0;
}

static void stabilize_lsps(double *lsps, int num)
{
    int n, m, l;

    /* set minimum value for first, maximum value for last and minimum
     * spacing between LSF values.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n]   = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);

    /* reorder (looks like one-time / non-recursed bubblesort).
     * Very similar to ff_sort_nearly_sorted_floats(), but in double.
     */
    for (n = 1; n < num; n++) {
        if (lsps[n] < lsps[n - 1]) {
            for (m = 1; m < num; m++) {
                double tmp = lsps[m];
                for (l = m - 1; l >= 0; l--) {
                    if (lsps[l] <= tmp) break;
                    lsps[l + 1] = lsps[l];
                }
                lsps[l + 1] = tmp;
            }
            break;
        }
    }
}

static int check_bits_for_superframe(GetBitContext *orig_gb,
                                     WMAVoiceContext *s)
{
    GetBitContext s_gb, *gb = &s_gb;
    int n, need_bits, bd_idx;
    const struct frame_type_desc *frame_desc;

    /* initialize a copy */
    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
    skip_bits_long(gb, get_bits_count(orig_gb));
    assert(get_bits_left(gb) == get_bits_left(orig_gb));

    /* superframe header */
    if (get_bits_left(gb) < 14)
        return 1;
    if (!get_bits1(gb))
        return -1;                        // WMAPro-in-WMAVoice superframe
    if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
            return 1;
        skip_bits_long(gb, s->sframe_lsp_bitsize);
    }

    /* frames */
    for (n = 0; n < MAX_FRAMES; n++) {
        int aw_idx_is_ext = 0;

        if (!s->has_residual_lsps) { // independent LSPs (per-frame)
            if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
            skip_bits_long(gb, s->frame_lsp_bitsize);
        }
        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
        if (bd_idx < 0)
            return -1; // invalid frame type VLC code
        frame_desc = &frame_descs[bd_idx];
        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
            if (get_bits_left(gb) < s->pitch_nbits)
                return 1;
            skip_bits_long(gb, s->pitch_nbits);
        }
        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
            skip_bits(gb, 8);
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            int tmp = get_bits(gb, 6);
            if (tmp >= 0x36) {
                skip_bits(gb, 2);
                aw_idx_is_ext = 1;
            }
        }

        /* blocks */
        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
            need_bits = s->block_pitch_nbits +
                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            need_bits = 2 * !aw_idx_is_ext;
        } else
            need_bits = 0;
        need_bits += frame_desc->frame_size;
        if (get_bits_left(gb) < need_bits)
            return 1;
        skip_bits_long(gb, need_bits);
    }

    return 0;
}

static int synth_superframe(AVCodecContext *ctx,
                            float *samples, int *data_size)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb, s_gb;
    int n, res, n_samples = 480;
    double lsps[MAX_FRAMES][MAX_LSPS];
    const double *mean_lsf = s->lsps == 16 ?
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
    float synth[MAX_LSPS + MAX_SFRAMESIZE];

    memcpy(synth,      s->synth_history,
           s->lsps             * sizeof(*synth));
    memcpy(excitation, s->excitation_history,
           s->history_nsamples * sizeof(*excitation));

    if (s->sframe_cache_size > 0) {
        gb = &s_gb;
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
        s->sframe_cache_size = 0;
    }

    if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;

    /* First bit is speech/music bit, it differentiates between WMAVoice
     * speech samples (the actual codec) and WMAVoice music samples, which
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
     * the wild yet. */
    if (!get_bits1(gb)) {
        av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
        return -1;
    }

    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
    if (get_bits1(gb)) {
        if ((n_samples = get_bits(gb, 12)) > 480) {
            av_log(ctx, AV_LOG_ERROR,
                   "Superframe encodes >480 samples (%d), not allowed\n",
                   n_samples);
            return -1;
        }
    }
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
    if (s->has_residual_lsps) {
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];

        for (n = 0; n < s->lsps; n++)
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];

        if (s->lsps == 10) {
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
        } else /* s->lsps == 16 */
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);

        for (n = 0; n < s->lsps; n++) {
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
            lsps[2][n] += mean_lsf[n];
        }
        for (n = 0; n < 3; n++)
            stabilize_lsps(lsps[n], s->lsps);
    }

    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
    for (n = 0; n < 3; n++) {
        if (!s->has_residual_lsps) {
            int m;

            if (s->lsps == 10) {
                dequant_lsp10i(gb, lsps[n]);
            } else /* s->lsps == 16 */
                dequant_lsp16i(gb, lsps[n]);

            for (m = 0; m < s->lsps; m++)
                lsps[n][m] += mean_lsf[m];
            stabilize_lsps(lsps[n], s->lsps);
        }

        if ((res = synth_frame(ctx, gb, n,
                               &samples[n * MAX_FRAMESIZE],
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
                               &synth[s->lsps + n * MAX_FRAMESIZE])))
            return res;
    }

    /* Statistics? FIXME - we don't check for length, a slight overrun
     * will be caught by internal buffer padding, and anything else
     * will be skipped, not read. */
    if (get_bits1(gb)) {
        res = get_bits(gb, 4);
        skip_bits(gb, 10 * (res + 1));
    }
    /* Specify nr. of output samples */
    *data_size = n_samples * sizeof(float);

    /* Update history */
    memcpy(s->prev_lsps,          lsps[2],
           s->lsps             * sizeof(*s->prev_lsps));
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
           s->lsps             * sizeof(*synth));
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
           s->history_nsamples * sizeof(*excitation));
    if (s->do_apf)
        memmove(s->zero_exc_pf,   &s->zero_exc_pf[MAX_SFRAMESIZE],
                s->history_nsamples * sizeof(*s->zero_exc_pf));

    return 0;
}

static int parse_packet_header(WMAVoiceContext *s)
{
    GetBitContext *gb = &s->gb;
    unsigned int res;

    if (get_bits_left(gb) < 11)
        return 1;
    skip_bits(gb, 4);          // packet sequence number
    s->has_residual_lsps = get_bits1(gb);
    do {
        res = get_bits(gb, 6); // number of superframes per packet
                               // (minus first one if there is spillover)
        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
            return 1;
    } while (res == 0x3F);
    s->spillover_nbits = get_bits(gb, s->spillover_bitsize);

    return 0;
}

static void copy_bits(PutBitContext *pb,
                      const uint8_t *data, int size,
                      GetBitContext *gb, int nbits)
{
    int rmn_bytes, rmn_bits;

    rmn_bits = rmn_bytes = get_bits_left(gb);
    if (rmn_bits < nbits)
        return;
    rmn_bits &= 7; rmn_bytes >>= 3;
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
    ff_copy_bits(pb, data + size - rmn_bytes,
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
}

static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                                  int *data_size, AVPacket *avpkt)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb;
    int size, res, pos;

    if (*data_size < 480 * sizeof(float)) {
        av_log(ctx, AV_LOG_ERROR,
               "Output buffer too small (%d given - %lu needed)\n",
               *data_size, 480 * sizeof(float));
        return -1;
    }
    *data_size = 0;

    /* Packets are sometimes a multiple of ctx->block_align, with a packet
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
     * feeds us ASF packets, which may concatenate multiple "codec" packets
     * in a single "muxer" packet, so we artificially emulate that by
     * capping the packet size at ctx->block_align. */
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
    if (!size)
        return 0;
    init_get_bits(&s->gb, avpkt->data, size << 3);

    /* size == ctx->block_align is used to indicate whether we are dealing with
     * a new packet or a packet of which we already read the packet header
     * previously. */
    if (size == ctx->block_align) { // new packet header
        if ((res = parse_packet_header(s)) < 0)
            return res;

        /* If the packet header specifies a s->spillover_nbits, then we want
         * to push out all data of the previous packet (+ spillover) before
         * continuing to parse new superframes in the current packet.
         */
        if (s->spillover_nbits > 0) {
            if (s->sframe_cache_size > 0) {
                int cnt = get_bits_count(gb);
                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
                flush_put_bits(&s->pb);
                s->sframe_cache_size += s->spillover_nbits;
                if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
                    *data_size > 0) {
                    cnt += s->spillover_nbits;
                    s->skip_bits_next = cnt & 7;
                    return cnt >> 3;
                } else
                    skip_bits_long (gb, s->spillover_nbits - cnt +
                                    get_bits_count(gb)); // resync
            } else
                skip_bits_long(gb, s->spillover_nbits); // resync
        }
    } else if (s->skip_bits_next)
        skip_bits(gb, s->skip_bits_next);

    /* Try parsing superframes in current packet */
    s->sframe_cache_size = 0;
    s->skip_bits_next = 0;
    pos = get_bits_left(gb);
    if ((res = synth_superframe(ctx, data, data_size)) < 0) {
        return res;
    } else if (*data_size > 0) {
        int cnt = get_bits_count(gb);
        s->skip_bits_next = cnt & 7;
        return cnt >> 3;
    } else if ((s->sframe_cache_size = pos) > 0) {
        /* rewind bit reader to start of last (incomplete) superframe... */
        init_get_bits(gb, avpkt->data, size << 3);
        skip_bits_long(gb, (size << 3) - pos);
        assert(get_bits_left(gb) == pos);

        /* ...and cache it for spillover in next packet */
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
        // FIXME bad - just copy bytes as a whole and use the
        // skip_bits_next field
    }

    return size;
}

static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;

    if (s->do_apf) {
        ff_rdft_end(&s->rdft);
        ff_rdft_end(&s->irdft);
        ff_dct_end(&s->dct);
        ff_dct_end(&s->dst);
    }

    return 0;
}

static av_cold void wmavoice_flush(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n;

    s->postfilter_agc    = 0;
    s->sframe_cache_size = 0;
    s->skip_bits_next    = 0;
    for (n = 0; n < s->lsps; n++)
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
    memset(s->excitation_history, 0,
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
    memset(s->synth_history,      0,
           sizeof(*s->synth_history)      * MAX_LSPS);
    memset(s->gain_pred_err,      0,
           sizeof(s->gain_pred_err));

    if (s->do_apf) {
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
               sizeof(*s->synth_filter_out_buf) * s->lsps);
        memset(s->dcf_mem,              0,
               sizeof(*s->dcf_mem)              * 2);
        memset(s->zero_exc_pf,          0,
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
    }
}

AVCodec wmavoice_decoder = {
    "wmavoice",
    AVMEDIA_TYPE_AUDIO,
    CODEC_ID_WMAVOICE,
    sizeof(WMAVoiceContext),
    wmavoice_decode_init,
    NULL,
    wmavoice_decode_end,
    wmavoice_decode_packet,
    CODEC_CAP_SUBFRAMES,
    .flush     = wmavoice_flush,
    .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
};