libavcodec/wmavoice.c

   1 /*
   2  * Windows Media Audio Voice decoder.
   3  * Copyright (c) 2009 Ronald S. Bultje
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * @brief Windows Media Audio Voice compatible decoder
  25  * @author Ronald S. Bultje <rsbultje@gmail.com>
  26  */
  27
  28 #include <math.h>
  29 #include "avcodec.h"
  30 #include "get_bits.h"
  31 #include "put_bits.h"
  32 #include "wmavoice_data.h"
  33 #include "celp_math.h"
  34 #include "celp_filters.h"
  35 #include "acelp_vectors.h"
  36 #include "acelp_filters.h"
  37 #include "lsp.h"
  38 #include "libavutil/lzo.h"
  39 #include "dct.h"
  40 #include "rdft.h"
  41 #include "sinewin.h"
  42
  43 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
  44 #define MAX_LSPS             16  ///< maximum filter order
  45 #define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
  46                                  ///< of 16 for ASM input buffer alignment
  47 #define MAX_FRAMES           3   ///< maximum number of frames per superframe
  48 #define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
  49 #define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
  50 #define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
  51                                  ///< maximum number of samples per superframe
  52 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
  53                                  ///< was split over two packets
  54 #define VLC_NBITS            6   ///< number of bits to read per VLC iteration
  55
  56 /**
  57  * Frame type VLC coding.
  58  */
  59 static VLC frame_type_vlc;
  60
  61 /**
  62  * Adaptive codebook types.
  63  */
  64 enum {
  65     ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
  66     ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
  67                              ///< we interpolate to get a per-sample pitch.
  68                              ///< Signal is generated using an asymmetric sinc
  69                              ///< window function
  70                              ///< @note see #wmavoice_ipol1_coeffs
  71     ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
  72                              ///< a Hamming sinc window function
  73                              ///< @note see #wmavoice_ipol2_coeffs
  74 };
  75
  76 /**
  77  * Fixed codebook types.
  78  */
  79 enum {
  80     FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
  81                              ///< generated from a hardcoded (fixed) codebook
  82                              ///< with per-frame (low) gain values
  83     FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
  84                              ///< gain values
  85     FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
  86                              ///< used in particular for low-bitrate streams
  87     FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
  88                              ///< combinations of either single pulses or
  89                              ///< pulse pairs
  90 };
  91
  92 /**
  93  * Description of frame types.
  94  */
  95 static const struct frame_type_desc {
  96     uint8_t n_blocks;     ///< amount of blocks per frame (each block
  97                           ///< (contains 160/#n_blocks samples)
  98     uint8_t log_n_blocks; ///< log2(#n_blocks)
  99     uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
 100     uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
 101     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
 102                           ///< (rather than just one single pulse)
 103                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
 104     uint16_t frame_size;  ///< the amount of bits that make up the block
 105                           ///< data (per frame)
 106 } frame_descs[17] = {
 107     { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
 108     { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
 109     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
 110     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
 111     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
 112     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
 113     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
 114     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
 115     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
 116     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
 117     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
 118     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
 119     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
 120     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
 121     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
 122     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
 123     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
 124 };
 125
 126 /**
 127  * WMA Voice decoding context.
 128  */
 129 typedef struct {
 130     /**
 131      * @defgroup struct_global Global values
 132      * Global values, specified in the stream header / extradata or used
 133      * all over.
 134      * @{
 135      */
 136     GetBitContext gb;             ///< packet bitreader. During decoder init,
 137                                   ///< it contains the extradata from the
 138                                   ///< demuxer. During decoding, it contains
 139                                   ///< packet data.
 140     int8_t vbm_tree[25];          ///< converts VLC codes to frame type
 141
 142     int spillover_bitsize;        ///< number of bits used to specify
 143                                   ///< #spillover_nbits in the packet header
 144                                   ///< = ceil(log2(ctx->block_align << 3))
 145     int history_nsamples;         ///< number of samples in history for signal
 146                                   ///< prediction (through ACB)
 147
 148     /* postfilter specific values */
 149     int do_apf;                   ///< whether to apply the averaged
 150                                   ///< projection filter (APF)
 151     int denoise_strength;         ///< strength of denoising in Wiener filter
 152                                   ///< [0-11]
 153     int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
 154                                   ///< Wiener filter coefficients (postfilter)
 155     int dc_level;                 ///< Predicted amount of DC noise, based
 156                                   ///< on which a DC removal filter is used
 157
 158     int lsps;                     ///< number of LSPs per frame [10 or 16]
 159     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
 160     int lsp_def_mode;             ///< defines different sets of LSP defaults
 161                                   ///< [0, 1]
 162     int frame_lsp_bitsize;        ///< size (in bits) of LSPs, when encoded
 163                                   ///< per-frame (independent coding)
 164     int sframe_lsp_bitsize;       ///< size (in bits) of LSPs, when encoded
 165                                   ///< per superframe (residual coding)
 166
 167     int min_pitch_val;            ///< base value for pitch parsing code
 168     int max_pitch_val;            ///< max value + 1 for pitch parsing
 169     int pitch_nbits;              ///< number of bits used to specify the
 170                                   ///< pitch value in the frame header
 171     int block_pitch_nbits;        ///< number of bits used to specify the
 172                                   ///< first block's pitch value
 173     int block_pitch_range;        ///< range of the block pitch
 174     int block_delta_pitch_nbits;  ///< number of bits used to specify the
 175                                   ///< delta pitch between this and the last
 176                                   ///< block's pitch value, used in all but
 177                                   ///< first block
 178     int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
 179                                   ///< from -this to +this-1)
 180     uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
 181                                   ///< conversion
 182
 183     /**
 184      * @}
 185      * @defgroup struct_packet Packet values
 186      * Packet values, specified in the packet header or related to a packet.
 187      * A packet is considered to be a single unit of data provided to this
 188      * decoder by the demuxer.
 189      * @{
 190      */
 191     int spillover_nbits;          ///< number of bits of the previous packet's
 192                                   ///< last superframe preceeding this
 193                                   ///< packet's first full superframe (useful
 194                                   ///< for re-synchronization also)
 195     int has_residual_lsps;        ///< if set, superframes contain one set of
 196                                   ///< LSPs that cover all frames, encoded as
 197                                   ///< independent and residual LSPs; if not
 198                                   ///< set, each frame contains its own, fully
 199                                   ///< independent, LSPs
 200     int skip_bits_next;           ///< number of bits to skip at the next call
 201                                   ///< to #wmavoice_decode_packet() (since
 202                                   ///< they're part of the previous superframe)
 203
 204     uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
 205                                   ///< cache for superframe data split over
 206                                   ///< multiple packets
 207     int sframe_cache_size;        ///< set to >0 if we have data from an
 208                                   ///< (incomplete) superframe from a previous
 209                                   ///< packet that spilled over in the current
 210                                   ///< packet; specifies the amount of bits in
 211                                   ///< #sframe_cache
 212     PutBitContext pb;             ///< bitstream writer for #sframe_cache
 213
 214     /**
 215      * @}
 216      * @defgroup struct_frame Frame and superframe values
 217      * Superframe and frame data - these can change from frame to frame,
 218      * although some of them do in that case serve as a cache / history for
 219      * the next frame or superframe.
 220      * @{
 221      */
 222     double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
 223                                   ///< superframe
 224     int last_pitch_val;           ///< pitch value of the previous frame
 225     int last_acb_type;            ///< frame type [0-2] of the previous frame
 226     int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
 227                                   ///< << 16) / #MAX_FRAMESIZE
 228     float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
 229
 230     int aw_idx_is_ext;            ///< whether the AW index was encoded in
 231                                   ///< 8 bits (instead of 6)
 232     int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
 233                                   ///< can apply the pulse, relative to the
 234                                   ///< value in aw_first_pulse_off. The exact
 235                                   ///< position of the first AW-pulse is within
 236                                   ///< [pulse_off, pulse_off + this], and
 237                                   ///< depends on bitstream values; [16 or 24]
 238     int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
 239                                   ///< that this number can be negative (in
 240                                   ///< which case it basically means "zero")
 241     int aw_first_pulse_off[2];    ///< index of first sample to which to
 242                                   ///< apply AW-pulses, or -0xff if unset
 243     int aw_next_pulse_off_cache;  ///< the position (relative to start of the
 244                                   ///< second block) at which pulses should
 245                                   ///< start to be positioned, serves as a
 246                                   ///< cache for pitch-adaptive window pulses
 247                                   ///< between blocks
 248
 249     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
 250                                   ///< only used for comfort noise in #pRNG()
 251     float gain_pred_err[6];       ///< cache for gain prediction
 252     float excitation_history[MAX_SIGNAL_HISTORY];
 253                                   ///< cache of the signal of previous
 254                                   ///< superframes, used as a history for
 255                                   ///< signal generation
 256     float synth_history[MAX_LSPS]; ///< see #excitation_history
 257     /**
 258      * @}
 259      * @defgroup post_filter Postfilter values
 260      * Variables used for postfilter implementation, mostly history for
 261      * smoothing and so on, and context variables for FFT/iFFT.
 262      * @{
 263      */
 264     RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
 265                                   ///< postfilter (for denoise filter)
 266     DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
 267                                   ///< transform, part of postfilter)
 268     float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
 269                                   ///< range
 270     float postfilter_agc;         ///< gain control memory, used in
 271                                   ///< #adaptive_gain_control()
 272     float dcf_mem[2];             ///< DC filter history
 273     float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
 274                                   ///< zero filter output (i.e. excitation)
 275                                   ///< by postfilter
 276     float denoise_filter_cache[MAX_FRAMESIZE];
 277     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
 278     DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
 279                                   ///< aligned buffer for LPC tilting
 280     DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
 281                                   ///< aligned buffer for denoise coefficients
 282     DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
 283                                   ///< aligned buffer for postfilter speech
 284                                   ///< synthesis
 285     /**
 286      * @}
 287      */
 288 } WMAVoiceContext;
 289
 290 /**
 291  * Set up the variable bit mode (VBM) tree from container extradata.
 292  * @param gb bit I/O context.
 293  *           The bit context (s->gb) should be loaded with byte 23-46 of the
 294  *           container extradata (i.e. the ones containing the VBM tree).
 295  * @param vbm_tree pointer to array to which the decoded VBM tree will be
 296  *                 written.
 297  * @return 0 on success, <0 on error.
 298  */
 299 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
 300 {
 301     static const uint8_t bits[] = {
 302          2,  2,  2,  4,  4,  4,
 303          6,  6,  6,  8,  8,  8,
 304         10, 10, 10, 12, 12, 12,
 305         14, 14, 14, 14
 306     };
 307     static const uint16_t codes[] = {
 308           0x0000, 0x0001, 0x0002,        //              00/01/10
 309           0x000c, 0x000d, 0x000e,        //           11+00/01/10
 310           0x003c, 0x003d, 0x003e,        //         1111+00/01/10
 311           0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
 312           0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
 313           0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
 314           0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
 315     };
 316     int cntr[8], n, res;
 317
 318     memset(vbm_tree, 0xff, sizeof(vbm_tree));
 319     memset(cntr,     0,    sizeof(cntr));
 320     for (n = 0; n < 17; n++) {
 321         res = get_bits(gb, 3);
 322         if (cntr[res] > 3) // should be >= 3 + (res == 7))
 323             return -1;
 324         vbm_tree[res * 3 + cntr[res]++] = n;
 325     }
 326     INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
 327                     bits, 1, 1, codes, 2, 2, 132);
 328     return 0;
 329 }
 330
 331 /**
 332  * Set up decoder with parameters from demuxer (extradata etc.).
 333  */
 334 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
 335 {
 336     int n, flags, pitch_range, lsp16_flag;
 337     WMAVoiceContext *s = ctx->priv_data;
 338
 339     /**
 340      * Extradata layout:
 341      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
 342      * - byte 19-22: flags field (annoyingly in LE; see below for known
 343      *               values),
 344      * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
 345      *               rest is 0).
 346      */
 347     if (ctx->extradata_size != 46) {
 348         av_log(ctx, AV_LOG_ERROR,
 349                "Invalid extradata size %d (should be 46)\n",
 350                ctx->extradata_size);
 351         return -1;
 352     }
 353     flags                = AV_RL32(ctx->extradata + 18);
 354     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
 355     s->do_apf            =    flags & 0x1;
 356     if (s->do_apf) {
 357         ff_rdft_init(&s->rdft,  7, DFT_R2C);
 358         ff_rdft_init(&s->irdft, 7, IDFT_C2R);
 359         ff_dct_init(&s->dct,  6, DCT_I);
 360         ff_dct_init(&s->dst,  6, DST_I);
 361
 362         ff_sine_window_init(s->cos, 256);
 363         memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
 364         for (n = 0; n < 255; n++) {
 365             s->sin[n]       = -s->sin[510 - n];
 366             s->cos[510 - n] =  s->cos[n];
 367         }
 368     }
 369     s->denoise_strength  =   (flags >> 2) & 0xF;
 370     if (s->denoise_strength >= 12) {
 371         av_log(ctx, AV_LOG_ERROR,
 372                "Invalid denoise filter strength %d (max=11)\n",
 373                s->denoise_strength);
 374         return -1;
 375     }
 376     s->denoise_tilt_corr = !!(flags & 0x40);
 377     s->dc_level          =   (flags >> 7) & 0xF;
 378     s->lsp_q_mode        = !!(flags & 0x2000);
 379     s->lsp_def_mode      = !!(flags & 0x4000);
 380     lsp16_flag           =    flags & 0x1000;
 381     if (lsp16_flag) {
 382         s->lsps               = 16;
 383         s->frame_lsp_bitsize  = 34;
 384         s->sframe_lsp_bitsize = 60;
 385     } else {
 386         s->lsps               = 10;
 387         s->frame_lsp_bitsize  = 24;
 388         s->sframe_lsp_bitsize = 48;
 389     }
 390     for (n = 0; n < s->lsps; n++)
 391         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 392
 393     init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
 394     if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
 395         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
 396         return -1;
 397     }
 398
 399     s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
 400     s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
 401     pitch_range         = s->max_pitch_val - s->min_pitch_val;
 402     s->pitch_nbits      = av_ceil_log2(pitch_range);
 403     s->last_pitch_val   = 40;
 404     s->last_acb_type    = ACB_TYPE_NONE;
 405     s->history_nsamples = s->max_pitch_val + 8;
 406
 407     if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
 408         int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
 409             max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
 410
 411         av_log(ctx, AV_LOG_ERROR,
 412                "Unsupported samplerate %d (min=%d, max=%d)\n",
 413                ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
 414
 415         return -1;
 416     }
 417
 418     s->block_conv_table[0]      = s->min_pitch_val;
 419     s->block_conv_table[1]      = (pitch_range * 25) >> 6;
 420     s->block_conv_table[2]      = (pitch_range * 44) >> 6;
 421     s->block_conv_table[3]      = s->max_pitch_val - 1;
 422     s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
 423     s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
 424     s->block_pitch_range        = s->block_conv_table[2] +
 425                                   s->block_conv_table[3] + 1 +
 426                                   2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
 427     s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
 428
 429     ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
 430
 431     return 0;
 432 }
 433
 434 /**
 435  * @defgroup postfilter Postfilter functions
 436  * Postfilter functions (gain control, wiener denoise filter, DC filter,
 437  * kalman smoothening, plus surrounding code to wrap it)
 438  * @{
 439  */
 440 /**
 441  * Adaptive gain control (as used in postfilter).
 442  *
 443  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 444  * that the energy here is calculated using sum(abs(...)), whereas the
 445  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 446  *
 447  * @param out output buffer for filtered samples
 448  * @param in input buffer containing the samples as they are after the
 449  *           postfilter steps so far
 450  * @param speech_synth input buffer containing speech synth before postfilter
 451  * @param size input buffer size
 452  * @param alpha exponential filter factor
 453  * @param gain_mem pointer to filter memory (single float)
 454  */
 455 static void adaptive_gain_control(float *out, const float *in,
 456                                   const float *speech_synth,
 457                                   int size, float alpha, float *gain_mem)
 458 {
 459     int i;
 460     float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
 461     float mem = *gain_mem;
 462
 463     for (i = 0; i < size; i++) {
 464         speech_energy     += fabsf(speech_synth[i]);
 465         postfilter_energy += fabsf(in[i]);
 466     }
 467     gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
 468
 469     for (i = 0; i < size; i++) {
 470         mem = alpha * mem + gain_scale_factor;
 471         out[i] = in[i] * mem;
 472     }
 473
 474     *gain_mem = mem;
 475 }
 476
 477 /**
 478  * Kalman smoothing function.
 479  *
 480  * This function looks back pitch +/- 3 samples back into history to find
 481  * the best fitting curve (that one giving the optimal gain of the two
 482  * signals, i.e. the highest dot product between the two), and then
 483  * uses that signal history to smoothen the output of the speech synthesis
 484  * filter.
 485  *
 486  * @param s WMA Voice decoding context
 487  * @param pitch pitch of the speech signal
 488  * @param in input speech signal
 489  * @param out output pointer for smoothened signal
 490  * @param size input/output buffer size
 491  *
 492  * @returns -1 if no smoothening took place, e.g. because no optimal
 493  *          fit could be found, or 0 on success.
 494  */
 495 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 496                            const float *in, float *out, int size)
 497 {
 498     int n;
 499     float optimal_gain = 0, dot;
 500     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
 501                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
 502                 *best_hist_ptr;
 503
 504     /* find best fitting point in history */
 505     do {
 506         dot = ff_dot_productf(in, ptr, size);
 507         if (dot > optimal_gain) {
 508             optimal_gain  = dot;
 509             best_hist_ptr = ptr;
 510         }
 511     } while (--ptr >= end);
 512
 513     if (optimal_gain <= 0)
 514         return -1;
 515     dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
 516     if (dot <= 0) // would be 1.0
 517         return -1;
 518
 519     if (optimal_gain <= dot) {
 520         dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
 521     } else
 522         dot = 0.625;
 523
 524     /* actual smoothing */
 525     for (n = 0; n < size; n++)
 526         out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
 527
 528     return 0;
 529 }
 530
 531 /**
 532  * Get the tilt factor of a formant filter from its transfer function
 533  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 534  *      but somehow (??) it does a speech synthesis filter in the
 535  *      middle, which is missing here
 536  *
 537  * @param lpcs LPC coefficients
 538  * @param n_lpcs Size of LPC buffer
 539  * @returns the tilt factor
 540  */
 541 static float tilt_factor(const float *lpcs, int n_lpcs)
 542 {
 543     float rh0, rh1;
 544
 545     rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
 546     rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
 547
 548     return rh1 / rh0;
 549 }
 550
 551 /**
 552  * Derive denoise filter coefficients (in real domain) from the LPCs.
 553  */
 554 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
 555                                 int fcb_type, float *coeffs, int remainder)
 556 {
 557     float last_coeff, min = 15.0, max = -15.0;
 558     float irange, angle_mul, gain_mul, range, sq;
 559     int n, idx;
 560
 561     /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
 562     s->rdft.rdft_calc(&s->rdft, lpcs);
 563 #define log_range(var, assign) do { \
 564         float tmp = log10f(assign);  var = tmp; \
 565         max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
 566     } while (0)
 567     log_range(last_coeff,  lpcs[1]         * lpcs[1]);
 568     for (n = 1; n < 64; n++)
 569         log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
 570                            lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
 571     log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
 572 #undef log_range
 573     range    = max - min;
 574     lpcs[64] = last_coeff;
 575
 576     /* Now, use this spectrum to pick out these frequencies with higher
 577      * (relative) power/energy (which we then take to be "not noise"),
 578      * and set up a table (still in lpc[]) of (relative) gains per frequency.
 579      * These frequencies will be maintained, while others ("noise") will be
 580      * decreased in the filter output. */
 581     irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
 582     gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
 583                                                           (5.0 / 14.7));
 584     angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
 585     for (n = 0; n <= 64; n++) {
 586         float pwr;
 587
 588         idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
 589         pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
 590         lpcs[n] = angle_mul * pwr;
 591
 592         /* 70.57 =~ 1/log10(1.0331663) */
 593         idx = (pwr * gain_mul - 0.0295) * 70.570526123;
 594         if (idx > 127) { // fallback if index falls outside table range
 595             coeffs[n] = wmavoice_energy_table[127] *
 596                         powf(1.0331663, idx - 127);
 597         } else
 598             coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
 599     }
 600
 601     /* calculate the Hilbert transform of the gains, which we do (since this
 602      * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
 603      * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
 604      * "moment" of the LPCs in this filter. */
 605     s->dct.dct_calc(&s->dct, lpcs);
 606     s->dst.dct_calc(&s->dst, lpcs);
 607
 608     /* Split out the coefficient indexes into phase/magnitude pairs */
 609     idx = 255 + av_clip(lpcs[64],               -255, 255);
 610     coeffs[0]  = coeffs[0]  * s->cos[idx];
 611     idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
 612     last_coeff = coeffs[64] * s->cos[idx];
 613     for (n = 63;; n--) {
 614         idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 615         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 616         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 617
 618         if (!--n) break;
 619
 620         idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 621         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 622         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 623     }
 624     coeffs[1] = last_coeff;
 625
 626     /* move into real domain */
 627     s->irdft.rdft_calc(&s->irdft, coeffs);
 628
 629     /* tilt correction and normalize scale */
 630     memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
 631     if (s->denoise_tilt_corr) {
 632         float tilt_mem = 0;
 633
 634         coeffs[remainder - 1] = 0;
 635         ff_tilt_compensation(&tilt_mem,
 636                              -1.8 * tilt_factor(coeffs, remainder - 1),
 637                              coeffs, remainder);
 638     }
 639     sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
 640     for (n = 0; n < remainder; n++)
 641         coeffs[n] *= sq;
 642 }
 643
 644 /**
 645  * This function applies a Wiener filter on the (noisy) speech signal as
 646  * a means to denoise it.
 647  *
 648  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
 649  * - using this power spectrum, calculate (for each frequency) the Wiener
 650  *    filter gain, which depends on the frequency power and desired level
 651  *    of noise subtraction (when set too high, this leads to artifacts)
 652  *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
 653  *    of 4-8kHz);
 654  * - by doing a phase shift, calculate the Hilbert transform of this array
 655  *    of per-frequency filter-gains to get the filtering coefficients;
 656  * - smoothen/normalize/de-tilt these filter coefficients as desired;
 657  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
 658  *    to get the denoised speech signal;
 659  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
 660  *    the frame boundary) are saved and applied to subsequent frames by an
 661  *    overlap-add method (otherwise you get clicking-artifacts).
 662  *
 663  * @param s WMA Voice decoding context
 664  * @param fcb_type Frame (codebook) type
 665  * @param synth_pf input: the noisy speech signal, output: denoised speech
 666  *                 data; should be 16-byte aligned (for ASM purposes)
 667  * @param size size of the speech data
 668  * @param lpcs LPCs used to synthesize this frame's speech data
 669  */
 670 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
 671                            float *synth_pf, int size,
 672                            const float *lpcs)
 673 {
 674     int remainder, lim, n;
 675
 676     if (fcb_type != FCB_TYPE_SILENCE) {
 677         float *tilted_lpcs = s->tilted_lpcs_pf,
 678               *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
 679
 680         tilted_lpcs[0]           = 1.0;
 681         memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
 682         memset(&tilted_lpcs[s->lsps + 1], 0,
 683                sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
 684         ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
 685                              tilted_lpcs, s->lsps + 2);
 686
 687         /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
 688          * size is applied to the next frame. All input beyond this is zero,
 689          * and thus all output beyond this will go towards zero, hence we can
 690          * limit to min(size-1, 127-size) as a performance consideration. */
 691         remainder = FFMIN(127 - size, size - 1);
 692         calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
 693
 694         /* apply coefficients (in frequency spectrum domain), i.e. complex
 695          * number multiplication */
 696         memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
 697         s->rdft.rdft_calc(&s->rdft, synth_pf);
 698         s->rdft.rdft_calc(&s->rdft, coeffs);
 699         synth_pf[0] *= coeffs[0];
 700         synth_pf[1] *= coeffs[1];
 701         for (n = 1; n < 64; n++) {
 702             float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
 703             synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
 704             synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
 705         }
 706         s->irdft.rdft_calc(&s->irdft, synth_pf);
 707     }
 708
 709     /* merge filter output with the history of previous runs */
 710     if (s->denoise_filter_cache_size) {
 711         lim = FFMIN(s->denoise_filter_cache_size, size);
 712         for (n = 0; n < lim; n++)
 713             synth_pf[n] += s->denoise_filter_cache[n];
 714         s->denoise_filter_cache_size -= lim;
 715         memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
 716                 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
 717     }
 718
 719     /* move remainder of filter output into a cache for future runs */
 720     if (fcb_type != FCB_TYPE_SILENCE) {
 721         lim = FFMIN(remainder, s->denoise_filter_cache_size);
 722         for (n = 0; n < lim; n++)
 723             s->denoise_filter_cache[n] += synth_pf[size + n];
 724         if (lim < remainder) {
 725             memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
 726                    sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
 727             s->denoise_filter_cache_size = remainder;
 728         }
 729     }
 730 }
 731
 732 /**
 733  * Averaging projection filter, the postfilter used in WMAVoice.
 734  *
 735  * This uses the following steps:
 736  * - A zero-synthesis filter (generate excitation from synth signal)
 737  * - Kalman smoothing on excitation, based on pitch
 738  * - Re-synthesized smoothened output
 739  * - Iterative Wiener denoise filter
 740  * - Adaptive gain filter
 741  * - DC filter
 742  *
 743  * @param s WMAVoice decoding context
 744  * @param synth Speech synthesis output (before postfilter)
 745  * @param samples Output buffer for filtered samples
 746  * @param size Buffer size of synth & samples
 747  * @param lpcs Generated LPCs used for speech synthesis
 748  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
 749  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
 750  * @param pitch Pitch of the input signal
 751  */
 752 static void postfilter(WMAVoiceContext *s, const float *synth,
 753                        float *samples,    int size,
 754                        const float *lpcs, float *zero_exc_pf,
 755                        int fcb_type,      int pitch)
 756 {
 757     float synth_filter_in_buf[MAX_FRAMESIZE / 2],
 758           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
 759           *synth_filter_in = zero_exc_pf;
 760
 761     assert(size <= MAX_FRAMESIZE / 2);
 762
 763     /* generate excitation from input signal */
 764     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
 765
 766     if (fcb_type >= FCB_TYPE_AW_PULSES &&
 767         !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
 768         synth_filter_in = synth_filter_in_buf;
 769
 770     /* re-synthesize speech after smoothening, and keep history */
 771     ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
 772                                  synth_filter_in, size, s->lsps);
 773     memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
 774            sizeof(synth_pf[0]) * s->lsps);
 775
 776     wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
 777
 778     adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
 779                           &s->postfilter_agc);
 780
 781     if (s->dc_level > 8) {
 782         /* remove ultra-low frequency DC noise / highpass filter;
 783          * coefficients are identical to those used in SIPR decoding,
 784          * and very closely resemble those used in AMR-NB decoding. */
 785         ff_acelp_apply_order_2_transfer_function(samples, samples,
 786             (const float[2]) { -1.99997,      1.0 },
 787             (const float[2]) { -1.9330735188, 0.93589198496 },
 788             0.93980580475, s->dcf_mem, size);
 789     }
 790 }
 791 /**
 792  * @}
 793  */
 794
 795 /**
 796  * Dequantize LSPs
 797  * @param lsps output pointer to the array that will hold the LSPs
 798  * @param num number of LSPs to be dequantized
 799  * @param values quantized values, contains n_stages values
 800  * @param sizes range (i.e. max value) of each quantized value
 801  * @param n_stages number of dequantization runs
 802  * @param table dequantization table to be used
 803  * @param mul_q LSF multiplier
 804  * @param base_q base (lowest) LSF values
 805  */
 806 static void dequant_lsps(double *lsps, int num,
 807                          const uint16_t *values,
 808                          const uint16_t *sizes,
 809                          int n_stages, const uint8_t *table,
 810                          const double *mul_q,
 811                          const double *base_q)
 812 {
 813     int n, m;
 814
 815     memset(lsps, 0, num * sizeof(*lsps));
 816     for (n = 0; n < n_stages; n++) {
 817         const uint8_t *t_off = &table[values[n] * num];
 818         double base = base_q[n], mul = mul_q[n];
 819
 820         for (m = 0; m < num; m++)
 821             lsps[m] += base + mul * t_off[m];
 822
 823         table += sizes[n] * num;
 824     }
 825 }
 826
 827 /**
 828  * @defgroup lsp_dequant LSP dequantization routines
 829  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
 830  * @note we assume enough bits are available, caller should check.
 831  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
 832  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
 833  * @{
 834  */
 835 /**
 836  * Parse 10 independently-coded LSPs.
 837  */
 838 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
 839 {
 840     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
 841     static const double mul_lsf[4] = {
 842         5.2187144800e-3,    1.4626986422e-3,
 843         9.6179549166e-4,    1.1325736225e-3
 844     };
 845     static const double base_lsf[4] = {
 846         M_PI * -2.15522e-1, M_PI * -6.1646e-2,
 847         M_PI * -3.3486e-2,  M_PI * -5.7408e-2
 848     };
 849     uint16_t v[4];
 850
 851     v[0] = get_bits(gb, 8);
 852     v[1] = get_bits(gb, 6);
 853     v[2] = get_bits(gb, 5);
 854     v[3] = get_bits(gb, 5);
 855
 856     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
 857                  mul_lsf, base_lsf);
 858 }
 859
 860 /**
 861  * Parse 10 independently-coded LSPs, and then derive the tables to
 862  * generate LSPs for the other frames from them (residual coding).
 863  */
 864 static void dequant_lsp10r(GetBitContext *gb,
 865                            double *i_lsps, const double *old,
 866                            double *a1, double *a2, int q_mode)
 867 {
 868     static const uint16_t vec_sizes[3] = { 128, 64, 64 };
 869     static const double mul_lsf[3] = {
 870         2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
 871     };
 872     static const double base_lsf[3] = {
 873         M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
 874     };
 875     const float (*ipol_tab)[2][10] = q_mode ?
 876         wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
 877     uint16_t interpol, v[3];
 878     int n;
 879
 880     dequant_lsp10i(gb, i_lsps);
 881
 882     interpol = get_bits(gb, 5);
 883     v[0]     = get_bits(gb, 7);
 884     v[1]     = get_bits(gb, 6);
 885     v[2]     = get_bits(gb, 6);
 886
 887     for (n = 0; n < 10; n++) {
 888         double delta = old[n] - i_lsps[n];
 889         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 890         a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 891     }
 892
 893     dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
 894                  mul_lsf, base_lsf);
 895 }
 896
 897 /**
 898  * Parse 16 independently-coded LSPs.
 899  */
 900 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
 901 {
 902     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
 903     static const double mul_lsf[5] = {
 904         3.3439586280e-3,    6.9908173703e-4,
 905         3.3216608306e-3,    1.0334960326e-3,
 906         3.1899104283e-3
 907     };
 908     static const double base_lsf[5] = {
 909         M_PI * -1.27576e-1, M_PI * -2.4292e-2,
 910         M_PI * -1.28094e-1, M_PI * -3.2128e-2,
 911         M_PI * -1.29816e-1
 912     };
 913     uint16_t v[5];
 914
 915     v[0] = get_bits(gb, 8);
 916     v[1] = get_bits(gb, 6);
 917     v[2] = get_bits(gb, 7);
 918     v[3] = get_bits(gb, 6);
 919     v[4] = get_bits(gb, 7);
 920
 921     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
 922                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
 923     dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
 924                  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
 925     dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
 926                  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
 927 }
 928
 929 /**
 930  * Parse 16 independently-coded LSPs, and then derive the tables to
 931  * generate LSPs for the other frames from them (residual coding).
 932  */
 933 static void dequant_lsp16r(GetBitContext *gb,
 934                            double *i_lsps, const double *old,
 935                            double *a1, double *a2, int q_mode)
 936 {
 937     static const uint16_t vec_sizes[3] = { 128, 128, 128 };
 938     static const double mul_lsf[3] = {
 939         1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
 940     };
 941     static const double base_lsf[3] = {
 942         M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
 943     };
 944     const float (*ipol_tab)[2][16] = q_mode ?
 945         wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
 946     uint16_t interpol, v[3];
 947     int n;
 948
 949     dequant_lsp16i(gb, i_lsps);
 950
 951     interpol = get_bits(gb, 5);
 952     v[0]     = get_bits(gb, 7);
 953     v[1]     = get_bits(gb, 7);
 954     v[2]     = get_bits(gb, 7);
 955
 956     for (n = 0; n < 16; n++) {
 957         double delta = old[n] - i_lsps[n];
 958         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 959         a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 960     }
 961
 962     dequant_lsps( a2,     10,  v,     vec_sizes,    1,
 963                  wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
 964     dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
 965                  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
 966     dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
 967                  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
 968 }
 969
 970 /**
 971  * @}
 972  * @defgroup aw Pitch-adaptive window coding functions
 973  * The next few functions are for pitch-adaptive window coding.
 974  * @{
 975  */
 976 /**
 977  * Parse the offset of the first pitch-adaptive window pulses, and
 978  * the distribution of pulses between the two blocks in this frame.
 979  * @param s WMA Voice decoding context private data
 980  * @param gb bit I/O context
 981  * @param pitch pitch for each block in this frame
 982  */
 983 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
 984                             const int *pitch)
 985 {
 986     static const int16_t start_offset[94] = {
 987         -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
 988          13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
 989          27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
 990          45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
 991          69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
 992          93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
 993         117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
 994         141, 143, 145, 147, 149, 151, 153, 155, 157, 159
 995     };
 996     int bits, offset;
 997
 998     /* position of pulse */
 999     s->aw_idx_is_ext = 0;
1000     if ((bits = get_bits(gb, 6)) >= 54) {
1001         s->aw_idx_is_ext = 1;
1002         bits += (bits - 54) * 3 + get_bits(gb, 2);
1003     }
1004
1005     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1006      * the distribution of the pulses in each block contained in this frame. */
1007     s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1008     for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1009     s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1010     s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1011     offset                  += s->aw_n_pulses[0] * pitch[0];
1012     s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1013     s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1014
1015     /* if continuing from a position before the block, reset position to
1016      * start of block (when corrected for the range over which it can be
1017      * spread in aw_pulse_set1()). */
1018     if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1019         while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1020             s->aw_first_pulse_off[1] -= pitch[1];
1021         if (start_offset[bits] < 0)
1022             while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1023                 s->aw_first_pulse_off[0] -= pitch[0];
1024     }
1025 }
1026
1027 /**
1028  * Apply second set of pitch-adaptive window pulses.
1029  * @param s WMA Voice decoding context private data
1030  * @param gb bit I/O context
1031  * @param block_idx block index in frame [0, 1]
1032  * @param fcb structure containing fixed codebook vector info
1033  */
1034 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1035                           int block_idx, AMRFixed *fcb)
1036 {
1037     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1038     uint16_t *use_mask = use_mask_mem + 2;
1039     /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1040      * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1041      * of idx are the position of the bit within a particular item in the
1042      * array (0 being the most significant bit, and 15 being the least
1043      * significant bit), and the remainder (>> 4) is the index in the
1044      * use_mask[]-array. This is faster and uses less memory than using a
1045      * 80-byte/80-int array. */
1046     int pulse_off = s->aw_first_pulse_off[block_idx],
1047         pulse_start, n, idx, range, aidx, start_off = 0;
1048
1049     /* set offset of first pulse to within this block */
1050     if (s->aw_n_pulses[block_idx] > 0)
1051         while (pulse_off + s->aw_pulse_range < 1)
1052             pulse_off += fcb->pitch_lag;
1053
1054     /* find range per pulse */
1055     if (s->aw_n_pulses[0] > 0) {
1056         if (block_idx == 0) {
1057             range = 32;
1058         } else /* block_idx = 1 */ {
1059             range = 8;
1060             if (s->aw_n_pulses[block_idx] > 0)
1061                 pulse_off = s->aw_next_pulse_off_cache;
1062         }
1063     } else
1064         range = 16;
1065     pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1066
1067     /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1068      * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1069      * we exclude that range from being pulsed again in this function. */
1070     memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1071     memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1072     memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1073     if (s->aw_n_pulses[block_idx] > 0)
1074         for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1075             int excl_range         = s->aw_pulse_range; // always 16 or 24
1076             uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1077             int first_sh           = 16 - (idx & 15);
1078             *use_mask_ptr++       &= 0xFFFF << first_sh;
1079             excl_range            -= first_sh;
1080             if (excl_range >= 16) {
1081                 *use_mask_ptr++    = 0;
1082                 *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1083             } else
1084                 *use_mask_ptr     &= 0xFFFF >> excl_range;
1085         }
1086
1087     /* find the 'aidx'th offset that is not excluded */
1088     aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1089     for (n = 0; n <= aidx; pulse_start++) {
1090         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1091         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1092             if (use_mask[0])      idx = 0x0F;
1093             else if (use_mask[1]) idx = 0x1F;
1094             else if (use_mask[2]) idx = 0x2F;
1095             else if (use_mask[3]) idx = 0x3F;
1096             else if (use_mask[4]) idx = 0x4F;
1097             else                  return;
1098             idx -= av_log2_16bit(use_mask[idx >> 4]);
1099         }
1100         if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1101             use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1102             n++;
1103             start_off = idx;
1104         }
1105     }
1106
1107     fcb->x[fcb->n] = start_off;
1108     fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1109     fcb->n++;
1110
1111     /* set offset for next block, relative to start of that block */
1112     n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1113     s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1114 }
1115
1116 /**
1117  * Apply first set of pitch-adaptive window pulses.
1118  * @param s WMA Voice decoding context private data
1119  * @param gb bit I/O context
1120  * @param block_idx block index in frame [0, 1]
1121  * @param fcb storage location for fixed codebook pulse info
1122  */
1123 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1124                           int block_idx, AMRFixed *fcb)
1125 {
1126     int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1127     float v;
1128
1129     if (s->aw_n_pulses[block_idx] > 0) {
1130         int n, v_mask, i_mask, sh, n_pulses;
1131
1132         if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1133             n_pulses = 3;
1134             v_mask   = 8;
1135             i_mask   = 7;
1136             sh       = 4;
1137         } else { // 4 pulses, 1:sign + 2:index each
1138             n_pulses = 4;
1139             v_mask   = 4;
1140             i_mask   = 3;
1141             sh       = 3;
1142         }
1143
1144         for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1145             fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1146             fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1147                                  s->aw_first_pulse_off[block_idx];
1148             while (fcb->x[fcb->n] < 0)
1149                 fcb->x[fcb->n] += fcb->pitch_lag;
1150             if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1151                 fcb->n++;
1152         }
1153     } else {
1154         int num2 = (val & 0x1FF) >> 1, delta, idx;
1155
1156         if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1157         else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1158         else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1159         else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1160         v = (val & 0x200) ? -1.0 : 1.0;
1161
1162         fcb->no_repeat_mask |= 3 << fcb->n;
1163         fcb->x[fcb->n]       = idx - delta;
1164         fcb->y[fcb->n]       = v;
1165         fcb->x[fcb->n + 1]   = idx;
1166         fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1167         fcb->n              += 2;
1168     }
1169 }
1170
/**
 * @}
 *
 * Generate a pseudo-random number from frame_cntr and block_num, which
 * will lie in the range [0, 1000 - block_size) (so it can be used as an
 * index in a table of size 1000 of which you want to read block_size
 * entries).
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size) range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each row represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed = MUL16(block_num, 1877) + frame_cntr;
    unsigned int row, scaled;

    /* max value of the seed is 8*1877+0xFFFE=0x13AA6, so a single
     * subtraction is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    /* seed % 9 selects the table row */
    row = seed - 9 * MULH(477218589, seed);

    /* seed * 49995 / (row * 5 + 6), truncated to 16 bits */
    scaled = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));

    return scaled % (1000 - block_size);
}
1215
1216 /**
1217  * Parse hardcoded signal for a single block.
1218  * @note see #synth_block().
1219  */
1220 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1221                                  int block_idx, int size,
1222                                  const struct frame_type_desc *frame_desc,
1223                                  float *excitation)
1224 {
1225     float gain;
1226     int n, r_idx;
1227
1228     assert(size <= MAX_FRAMESIZE);
1229
1230     /* Set the offset from which we start reading wmavoice_std_codebook */
1231     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1232         r_idx = pRNG(s->frame_cntr, block_idx, size);
1233         gain  = s->silence_gain;
1234     } else /* FCB_TYPE_HARDCODED */ {
1235         r_idx = get_bits(gb, 8);
1236         gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1237     }
1238
1239     /* Clear gain prediction parameters */
1240     memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1241
1242     /* Apply gain to hardcoded codebook and use that as excitation signal */
1243     for (n = 0; n < size; n++)
1244         excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1245 }
1246
1247 /**
1248  * Parse FCB/ACB signal for a single block.
1249  * @note see #synth_block().
1250  */
1251 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1252                                 int block_idx, int size,
1253                                 int block_pitch_sh2,
1254                                 const struct frame_type_desc *frame_desc,
1255                                 float *excitation)
1256 {
1257     static const float gain_coeff[6] = {
1258         0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1259     };
1260     float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1261     int n, idx, gain_weight;
1262     AMRFixed fcb;
1263
1264     assert(size <= MAX_FRAMESIZE / 2);
1265     memset(pulses, 0, sizeof(*pulses) * size);
1266
1267     fcb.pitch_lag      = block_pitch_sh2 >> 2;
1268     fcb.pitch_fac      = 1.0;
1269     fcb.no_repeat_mask = 0;
1270     fcb.n              = 0;
1271
1272     /* For the other frame types, this is where we apply the innovation
1273      * (fixed) codebook pulses of the speech signal. */
1274     if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1275         aw_pulse_set1(s, gb, block_idx, &fcb);
1276         aw_pulse_set2(s, gb, block_idx, &fcb);
1277     } else /* FCB_TYPE_EXC_PULSES */ {
1278         int offset_nbits = 5 - frame_desc->log_n_blocks;
1279
1280         fcb.no_repeat_mask = -1;
1281         /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1282          * (instead of double) for a subset of pulses */
1283         for (n = 0; n < 5; n++) {
1284             float sign;
1285             int pos1, pos2;
1286
1287             sign           = get_bits1(gb) ? 1.0 : -1.0;
1288             pos1           = get_bits(gb, offset_nbits);
1289             fcb.x[fcb.n]   = n + 5 * pos1;
1290             fcb.y[fcb.n++] = sign;
1291             if (n < frame_desc->dbl_pulses) {
1292                 pos2           = get_bits(gb, offset_nbits);
1293                 fcb.x[fcb.n]   = n + 5 * pos2;
1294                 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1295             }
1296         }
1297     }
1298     ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1299
1300     /* Calculate gain for adaptive & fixed codebook signal.
1301      * see ff_amr_set_fixed_gain(). */
1302     idx = get_bits(gb, 7);
1303     fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
1304                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1305     acb_gain = wmavoice_gain_codebook_acb[idx];
1306     pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1307                         -2.9957322736 /* log(0.05) */,
1308                          1.6094379124 /* log(5.0)  */);
1309
1310     gain_weight = 8 >> frame_desc->log_n_blocks;
1311     memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1312             sizeof(*s->gain_pred_err) * (6 - gain_weight));
1313     for (n = 0; n < gain_weight; n++)
1314         s->gain_pred_err[n] = pred_err;
1315
1316     /* Calculation of adaptive codebook */
1317     if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1318         int len;
1319         for (n = 0; n < size; n += len) {
1320             int next_idx_sh16;
1321             int abs_idx    = block_idx * size + n;
1322             int pitch_sh16 = (s->last_pitch_val << 16) +
1323                              s->pitch_diff_sh16 * abs_idx;
1324             int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1325             int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1326             idx            = idx_sh16 >> 16;
1327             if (s->pitch_diff_sh16) {
1328                 if (s->pitch_diff_sh16 > 0) {
1329                     next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1330                 } else
1331                     next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1332                 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1333                               1, size - n);
1334             } else
1335                 len = size;
1336
1337             ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1338                                   wmavoice_ipol1_coeffs, 17,
1339                                   idx, 9, len);
1340         }
1341     } else /* ACB_TYPE_HAMMING */ {
1342         int block_pitch = block_pitch_sh2 >> 2;
1343         idx             = block_pitch_sh2 & 3;
1344         if (idx) {
1345             ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1346                                   wmavoice_ipol2_coeffs, 4,
1347                                   idx, 8, size);
1348         } else
1349             av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1350                               sizeof(float) * size);
1351     }
1352
1353     /* Interpolate ACB/FCB and use as excitation signal */
1354     ff_weighted_vector_sumf(excitation, excitation, pulses,
1355                             acb_gain, fcb_gain, size);
1356 }
1357
1358 /**
1359  * Parse data in a single block.
1360  * @note we assume enough bits are available, caller should check.
1361  *
1362  * @param s WMA Voice decoding context private data
1363  * @param gb bit I/O context
1364  * @param block_idx index of the to-be-read block
1365  * @param size amount of samples to be read in this block
1366  * @param block_pitch_sh2 pitch for this block << 2
1367  * @param lsps LSPs for (the end of) this frame
1368  * @param prev_lsps LSPs for the last frame
1369  * @param frame_desc frame type descriptor
1370  * @param excitation target memory for the ACB+FCB interpolated signal
1371  * @param synth target memory for the speech synthesis filter output
1372  * @return 0 on success, <0 on error.
1373  */
1374 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1375                         int block_idx, int size,
1376                         int block_pitch_sh2,
1377                         const double *lsps, const double *prev_lsps,
1378                         const struct frame_type_desc *frame_desc,
1379                         float *excitation, float *synth)
1380 {
1381     double i_lsps[MAX_LSPS];
1382     float lpcs[MAX_LSPS];
1383     float fac;
1384     int n;
1385
1386     if (frame_desc->acb_type == ACB_TYPE_NONE)
1387         synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1388     else
1389         synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1390                             frame_desc, excitation);
1391
1392     /* convert interpolated LSPs to LPCs */
1393     fac = (block_idx + 0.5) / frame_desc->n_blocks;
1394     for (n = 0; n < s->lsps; n++) // LSF -> LSP
1395         i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1396     ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1397
1398     /* Speech synthesis */
1399     ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1400 }
1401
1402 /**
1403  * Synthesize output samples for a single frame.
1404  * @note we assume enough bits are available, caller should check.
1405  *
1406  * @param ctx WMA Voice decoder context
1407  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1408  * @param frame_idx Frame number within superframe [0-2]
1409  * @param samples pointer to output sample buffer, has space for at least 160
1410  *                samples
1411  * @param lsps LSP array
1412  * @param prev_lsps array of previous frame's LSPs
1413  * @param excitation target buffer for excitation signal
1414  * @param synth target buffer for synthesized speech data
1415  * @return 0 on success, <0 on error.
1416  */
1417 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1418                        float *samples,
1419                        const double *lsps, const double *prev_lsps,
1420                        float *excitation, float *synth)
1421 {
1422     WMAVoiceContext *s = ctx->priv_data;
1423     int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1424     int pitch[MAX_BLOCKS], last_block_pitch;
1425
1426     /* Parse frame type ("frame header"), see frame_descs */
1427     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
1428         block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1429
1430     if (bd_idx < 0) {
1431         av_log(ctx, AV_LOG_ERROR,
1432                "Invalid frame type VLC code, skipping\n");
1433         return -1;
1434     }
1435
1436     /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1437     if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1438         /* Pitch is provided per frame, which is interpreted as the pitch of
1439          * the last sample of the last block of this frame. We can interpolate
1440          * the pitch of other blocks (and even pitch-per-sample) by gradually
1441          * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1442         n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1443         log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1444         cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1445         cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1446         if (s->last_acb_type == ACB_TYPE_NONE ||
1447             20 * abs(cur_pitch_val - s->last_pitch_val) >
1448                 (cur_pitch_val + s->last_pitch_val))
1449             s->last_pitch_val = cur_pitch_val;
1450
1451         /* pitch per block */
1452         for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1453             int fac = n * 2 + 1;
1454
1455             pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1456                         MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1457                         frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1458         }
1459
1460         /* "pitch-diff-per-sample" for calculation of pitch per sample */
1461         s->pitch_diff_sh16 =
1462             ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1463     }
1464
1465     /* Global gain (if silence) and pitch-adaptive window coordinates */
1466     switch (frame_descs[bd_idx].fcb_type) {
1467     case FCB_TYPE_SILENCE:
1468         s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1469         break;
1470     case FCB_TYPE_AW_PULSES:
1471         aw_parse_coords(s, gb, pitch);
1472         break;
1473     }
1474
1475     for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1476         int bl_pitch_sh2;
1477
1478         /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1479         switch (frame_descs[bd_idx].acb_type) {
1480         case ACB_TYPE_HAMMING: {
1481             /* Pitch is given per block. Per-block pitches are encoded as an
1482              * absolute value for the first block, and then delta values
1483              * relative to this value) for all subsequent blocks. The scale of
1484              * this pitch value is semi-logaritmic compared to its use in the
1485              * decoder, so we convert it to normal scale also. */
1486             int block_pitch,
1487                 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1488                 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1489                 t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1490
1491             if (n == 0) {
1492                 block_pitch = get_bits(gb, s->block_pitch_nbits);
1493             } else
1494                 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1495                                  get_bits(gb, s->block_delta_pitch_nbits);
1496             /* Convert last_ so that any next delta is within _range */
1497             last_block_pitch = av_clip(block_pitch,
1498                                        s->block_delta_pitch_hrange,
1499                                        s->block_pitch_range -
1500                                            s->block_delta_pitch_hrange);
1501
1502             /* Convert semi-log-style scale back to normal scale */
1503             if (block_pitch < t1) {
1504                 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1505             } else {
1506                 block_pitch -= t1;
1507                 if (block_pitch < t2) {
1508                     bl_pitch_sh2 =
1509                         (s->block_conv_table[1] << 2) + (block_pitch << 1);
1510                 } else {
1511                     block_pitch -= t2;
1512                     if (block_pitch < t3) {
1513                         bl_pitch_sh2 =
1514                             (s->block_conv_table[2] + block_pitch) << 2;
1515                     } else
1516                         bl_pitch_sh2 = s->block_conv_table[3] << 2;
1517                 }
1518             }
1519             pitch[n] = bl_pitch_sh2 >> 2;
1520             break;
1521         }
1522
1523         case ACB_TYPE_ASYMMETRIC: {
1524             bl_pitch_sh2 = pitch[n] << 2;
1525             break;
1526         }
1527
1528         default: // ACB_TYPE_NONE has no pitch
1529             bl_pitch_sh2 = 0;
1530             break;
1531         }
1532
1533         synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1534                     lsps, prev_lsps, &frame_descs[bd_idx],
1535                     &excitation[n * block_nsamples],
1536                     &synth[n * block_nsamples]);
1537     }
1538
1539     /* Averaging projection filter, if applicable. Else, just copy samples
1540      * from synthesis buffer */
1541     if (s->do_apf) {
1542         double i_lsps[MAX_LSPS];
1543         float lpcs[MAX_LSPS];
1544
1545         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1546             i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1547         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1548         postfilter(s, synth, samples, 80, lpcs,
1549                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1550                    frame_descs[bd_idx].fcb_type, pitch[0]);
1551
1552         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1553             i_lsps[n] = cos(lsps[n]);
1554         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1555         postfilter(s, &synth[80], &samples[80], 80, lpcs,
1556                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1557                    frame_descs[bd_idx].fcb_type, pitch[0]);
1558     } else
1559         memcpy(samples, synth, 160 * sizeof(synth[0]));
1560
1561     /* Cache values for next frame */
1562     s->frame_cntr++;
1563     if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1564     s->last_acb_type = frame_descs[bd_idx].acb_type;
1565     switch (frame_descs[bd_idx].acb_type) {
1566     case ACB_TYPE_NONE:
1567         s->last_pitch_val = 0;
1568         break;
1569     case ACB_TYPE_ASYMMETRIC:
1570         s->last_pitch_val = cur_pitch_val;
1571         break;
1572     case ACB_TYPE_HAMMING:
1573         s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1574         break;
1575     }
1576
1577     return 0;
1578 }
1579
1580 /**
1581  * Ensure minimum value for first item, maximum value for last value,
1582  * proper spacing between each value and proper ordering.
1583  *
1584  * @param lsps array of LSPs
1585  * @param num size of LSP array
1586  *
1587  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1588  *       useful to put in a generic location later on. Parts are also
1589  *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1590  *       which is in float.
1591  */
1592 static void stabilize_lsps(double *lsps, int num)
1593 {
1594     int n, m, l;
1595
1596     /* set minimum value for first, maximum value for last and minimum
1597      * spacing between LSF values.
1598      * Very similar to ff_set_min_dist_lsf(), but in double. */
1599     lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1600     for (n = 1; n < num; n++)
1601         lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1602     lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1603
1604     /* reorder (looks like one-time / non-recursed bubblesort).
1605      * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1606     for (n = 1; n < num; n++) {
1607         if (lsps[n] < lsps[n - 1]) {
1608             for (m = 1; m < num; m++) {
1609                 double tmp = lsps[m];
1610                 for (l = m - 1; l >= 0; l--) {
1611                     if (lsps[l] <= tmp) break;
1612                     lsps[l + 1] = lsps[l];
1613                 }
1614                 lsps[l + 1] = tmp;
1615             }
1616             break;
1617         }
1618     }
1619 }
1620
1621 /**
1622  * Test if there's enough bits to read 1 superframe.
1623  *
1624  * @param orig_gb bit I/O context used for reading. This function
1625  *                does not modify the state of the bitreader; it
1626  *                only uses it to copy the current stream position
1627  * @param s WMA Voice decoding context private data
1628  * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
1629  */
1630 static int check_bits_for_superframe(GetBitContext *orig_gb,
1631                                      WMAVoiceContext *s)
1632 {
1633     GetBitContext s_gb, *gb = &s_gb;
1634     int n, need_bits, bd_idx;
1635     const struct frame_type_desc *frame_desc;
1636
1637     /* initialize a copy */
1638     init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1639     skip_bits_long(gb, get_bits_count(orig_gb));
1640     assert(get_bits_left(gb) == get_bits_left(orig_gb));
1641
1642     /* superframe header */
1643     if (get_bits_left(gb) < 14)
1644         return 1;
1645     if (!get_bits1(gb))
1646         return -1;                        // WMAPro-in-WMAVoice superframe
1647     if (get_bits1(gb)) skip_bits(gb, 12); // number of  samples in superframe
1648     if (s->has_residual_lsps) {           // residual LSPs (for all frames)
1649         if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1650             return 1;
1651         skip_bits_long(gb, s->sframe_lsp_bitsize);
1652     }
1653
1654     /* frames */
1655     for (n = 0; n < MAX_FRAMES; n++) {
1656         int aw_idx_is_ext = 0;
1657
1658         if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
1659            if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1660            skip_bits_long(gb, s->frame_lsp_bitsize);
1661         }
1662         bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1663         if (bd_idx < 0)
1664             return -1;                   // invalid frame type VLC code
1665         frame_desc = &frame_descs[bd_idx];
1666         if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1667             if (get_bits_left(gb) < s->pitch_nbits)
1668                 return 1;
1669             skip_bits_long(gb, s->pitch_nbits);
1670         }
1671         if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1672             skip_bits(gb, 8);
1673         } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1674             int tmp = get_bits(gb, 6);
1675             if (tmp >= 0x36) {
1676                 skip_bits(gb, 2);
1677                 aw_idx_is_ext = 1;
1678             }
1679         }
1680
1681         /* blocks */
1682         if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1683             need_bits = s->block_pitch_nbits +
1684                 (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1685         } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1686             need_bits = 2 * !aw_idx_is_ext;
1687         } else
1688             need_bits = 0;
1689         need_bits += frame_desc->frame_size;
1690         if (get_bits_left(gb) < need_bits)
1691             return 1;
1692         skip_bits_long(gb, need_bits);
1693     }
1694
1695     return 0;
1696 }
1697
1698 /**
1699  * Synthesize output samples for a single superframe. If we have any data
1700  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1701  * in s->gb.
1702  *
1703  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1704  * to give a total of 480 samples per frame. See #synth_frame() for frame
1705  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1706  * (if these are globally specified for all frames (residually); they can
1707  * also be specified individually per-frame. See the s->has_residual_lsps
1708  * option), and can specify the number of samples encoded in this superframe
1709  * (if less than 480), usually used to prevent blanks at track boundaries.
1710  *
1711  * @param ctx WMA Voice decoder context
1712  * @param samples pointer to output buffer for voice samples
1713  * @param data_size pointer containing the size of #samples on input, and the
1714  *                  amount of #samples filled on output
1715  * @return 0 on success, <0 on error or 1 if there was not enough data to
1716  *         fully parse the superframe
1717  */
1718 static int synth_superframe(AVCodecContext *ctx,
1719                             float *samples, int *data_size)
1720 {
1721     WMAVoiceContext *s = ctx->priv_data;
1722     GetBitContext *gb = &s->gb, s_gb;
1723     int n, res, n_samples = 480;
1724     double lsps[MAX_FRAMES][MAX_LSPS];
1725     const double *mean_lsf = s->lsps == 16 ?
1726         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1727     float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1728     float synth[MAX_LSPS + MAX_SFRAMESIZE];
1729
1730     memcpy(synth,      s->synth_history,
1731            s->lsps             * sizeof(*synth));
1732     memcpy(excitation, s->excitation_history,
1733            s->history_nsamples * sizeof(*excitation));
1734
1735     if (s->sframe_cache_size > 0) {
1736         gb = &s_gb;
1737         init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1738         s->sframe_cache_size = 0;
1739     }
1740
1741     if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
1742
1743     /* First bit is speech/music bit, it differentiates between WMAVoice
1744      * speech samples (the actual codec) and WMAVoice music samples, which
1745      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1746      * the wild yet. */
1747     if (!get_bits1(gb)) {
1748         av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1749         return -1;
1750     }
1751
1752     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1753     if (get_bits1(gb)) {
1754         if ((n_samples = get_bits(gb, 12)) > 480) {
1755             av_log(ctx, AV_LOG_ERROR,
1756                    "Superframe encodes >480 samples (%d), not allowed\n",
1757                    n_samples);
1758             return -1;
1759         }
1760     }
1761     /* Parse LSPs, if global for the superframe (can also be per-frame). */
1762     if (s->has_residual_lsps) {
1763         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1764
1765         for (n = 0; n < s->lsps; n++)
1766             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1767
1768         if (s->lsps == 10) {
1769             dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1770         } else /* s->lsps == 16 */
1771             dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1772
1773         for (n = 0; n < s->lsps; n++) {
1774             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1775             lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1776             lsps[2][n] += mean_lsf[n];
1777         }
1778         for (n = 0; n < 3; n++)
1779             stabilize_lsps(lsps[n], s->lsps);
1780     }
1781
1782     /* Parse frames, optionally preceeded by per-frame (independent) LSPs. */
1783     for (n = 0; n < 3; n++) {
1784         if (!s->has_residual_lsps) {
1785             int m;
1786
1787             if (s->lsps == 10) {
1788                 dequant_lsp10i(gb, lsps[n]);
1789             } else /* s->lsps == 16 */
1790                 dequant_lsp16i(gb, lsps[n]);
1791
1792             for (m = 0; m < s->lsps; m++)
1793                 lsps[n][m] += mean_lsf[m];
1794             stabilize_lsps(lsps[n], s->lsps);
1795         }
1796
1797         if ((res = synth_frame(ctx, gb, n,
1798                                &samples[n * MAX_FRAMESIZE],
1799                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1800                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1801                                &synth[s->lsps + n * MAX_FRAMESIZE])))
1802             return res;
1803     }
1804
1805     /* Statistics? FIXME - we don't check for length, a slight overrun
1806      * will be caught by internal buffer padding, and anything else
1807      * will be skipped, not read. */
1808     if (get_bits1(gb)) {
1809         res = get_bits(gb, 4);
1810         skip_bits(gb, 10 * (res + 1));
1811     }
1812
1813     /* Specify nr. of output samples */
1814     *data_size = n_samples * sizeof(float);
1815
1816     /* Update history */
1817     memcpy(s->prev_lsps,           lsps[2],
1818            s->lsps             * sizeof(*s->prev_lsps));
1819     memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1820            s->lsps             * sizeof(*synth));
1821     memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1822            s->history_nsamples * sizeof(*excitation));
1823     if (s->do_apf)
1824         memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1825                 s->history_nsamples * sizeof(*s->zero_exc_pf));
1826
1827     return 0;
1828 }
1829
1830 /**
1831  * Parse the packet header at the start of each packet (input data to this
1832  * decoder).
1833  *
1834  * @param s WMA Voice decoding context private data
1835  * @return 1 if not enough bits were available, or 0 on success.
1836  */
1837 static int parse_packet_header(WMAVoiceContext *s)
1838 {
1839     GetBitContext *gb = &s->gb;
1840     unsigned int res;
1841
1842     if (get_bits_left(gb) < 11)
1843         return 1;
1844     skip_bits(gb, 4);          // packet sequence number
1845     s->has_residual_lsps = get_bits1(gb);
1846     do {
1847         res = get_bits(gb, 6); // number of superframes per packet
1848                                // (minus first one if there is spillover)
1849         if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1850             return 1;
1851     } while (res == 0x3F);
1852     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1853
1854     return 0;
1855 }
1856
1857 /**
1858  * Copy (unaligned) bits from gb/data/size to pb.
1859  *
1860  * @param pb target buffer to copy bits into
1861  * @param data source buffer to copy bits from
1862  * @param size size of the source data, in bytes
1863  * @param gb bit I/O context specifying the current position in the source.
1864  *           data. This function might use this to align the bit position to
1865  *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1866  *           source data
1867  * @param nbits the amount of bits to copy from source to target
1868  *
1869  * @note after calling this function, the current position in the input bit
1870  *       I/O context is undefined.
1871  */
1872 static void copy_bits(PutBitContext *pb,
1873                       const uint8_t *data, int size,
1874                       GetBitContext *gb, int nbits)
1875 {
1876     int rmn_bytes, rmn_bits;
1877
1878     rmn_bits = rmn_bytes = get_bits_left(gb);
1879     if (rmn_bits < nbits)
1880         return;
1881     rmn_bits &= 7; rmn_bytes >>= 3;
1882     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1883         put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1884     ff_copy_bits(pb, data + size - rmn_bytes,
1885                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1886 }
1887
1888 /**
1889  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1890  * and we expect that the demuxer / application provides it to us as such
1891  * (else you'll probably get garbage as output). Every packet has a size of
1892  * ctx->block_align bytes, starts with a packet header (see
1893  * #parse_packet_header()), and then a series of superframes. Superframe
1894  * boundaries may exceed packets, i.e. superframes can split data over
1895  * multiple (two) packets.
1896  *
1897  * For more information about frames, see #synth_superframe().
1898  */
1899 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1900                                   int *data_size, AVPacket *avpkt)
1901 {
1902     WMAVoiceContext *s = ctx->priv_data;
1903     GetBitContext *gb = &s->gb;
1904     int size, res, pos;
1905
1906     if (*data_size < 480 * sizeof(float)) {
1907         av_log(ctx, AV_LOG_ERROR,
1908                "Output buffer too small (%d given - %zu needed)\n",
1909                *data_size, 480 * sizeof(float));
1910         return -1;
1911     }
1912     *data_size = 0;
1913
1914     /* Packets are sometimes a multiple of ctx->block_align, with a packet
1915      * header at each ctx->block_align bytes. However, Libav's ASF demuxer
1916      * feeds us ASF packets, which may concatenate multiple "codec" packets
1917      * in a single "muxer" packet, so we artificially emulate that by
1918      * capping the packet size at ctx->block_align. */
1919     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1920     if (!size)
1921         return 0;
1922     init_get_bits(&s->gb, avpkt->data, size << 3);
1923
1924     /* size == ctx->block_align is used to indicate whether we are dealing with
1925      * a new packet or a packet of which we already read the packet header
1926      * previously. */
1927     if (size == ctx->block_align) { // new packet header
1928         if ((res = parse_packet_header(s)) < 0)
1929             return res;
1930
1931         /* If the packet header specifies a s->spillover_nbits, then we want
1932          * to push out all data of the previous packet (+ spillover) before
1933          * continuing to parse new superframes in the current packet. */
1934         if (s->spillover_nbits > 0) {
1935             if (s->sframe_cache_size > 0) {
1936                 int cnt = get_bits_count(gb);
1937                 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1938                 flush_put_bits(&s->pb);
1939                 s->sframe_cache_size += s->spillover_nbits;
1940                 if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
1941                     *data_size > 0) {
1942                     cnt += s->spillover_nbits;
1943                     s->skip_bits_next = cnt & 7;
1944                     return cnt >> 3;
1945                 } else
1946                     skip_bits_long (gb, s->spillover_nbits - cnt +
1947                                     get_bits_count(gb)); // resync
1948             } else
1949                 skip_bits_long(gb, s->spillover_nbits);  // resync
1950         }
1951     } else if (s->skip_bits_next)
1952         skip_bits(gb, s->skip_bits_next);
1953
1954     /* Try parsing superframes in current packet */
1955     s->sframe_cache_size = 0;
1956     s->skip_bits_next = 0;
1957     pos = get_bits_left(gb);
1958     if ((res = synth_superframe(ctx, data, data_size)) < 0) {
1959         return res;
1960     } else if (*data_size > 0) {
1961         int cnt = get_bits_count(gb);
1962         s->skip_bits_next = cnt & 7;
1963         return cnt >> 3;
1964     } else if ((s->sframe_cache_size = pos) > 0) {
1965         /* rewind bit reader to start of last (incomplete) superframe... */
1966         init_get_bits(gb, avpkt->data, size << 3);
1967         skip_bits_long(gb, (size << 3) - pos);
1968         assert(get_bits_left(gb) == pos);
1969
1970         /* ...and cache it for spillover in next packet */
1971         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1972         copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1973         // FIXME bad - just copy bytes as whole and add use the
1974         // skip_bits_next field
1975     }
1976
1977     return size;
1978 }
1979
1980 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1981 {
1982     WMAVoiceContext *s = ctx->priv_data;
1983
1984     if (s->do_apf) {
1985         ff_rdft_end(&s->rdft);
1986         ff_rdft_end(&s->irdft);
1987         ff_dct_end(&s->dct);
1988         ff_dct_end(&s->dst);
1989     }
1990
1991     return 0;
1992 }
1993
1994 static av_cold void wmavoice_flush(AVCodecContext *ctx)
1995 {
1996     WMAVoiceContext *s = ctx->priv_data;
1997     int n;
1998
1999     s->postfilter_agc    = 0;
2000     s->sframe_cache_size = 0;
2001     s->skip_bits_next    = 0;
2002     for (n = 0; n < s->lsps; n++)
2003         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2004     memset(s->excitation_history, 0,
2005            sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2006     memset(s->synth_history,      0,
2007            sizeof(*s->synth_history)      * MAX_LSPS);
2008     memset(s->gain_pred_err,      0,
2009            sizeof(s->gain_pred_err));
2010
2011     if (s->do_apf) {
2012         memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2013                sizeof(*s->synth_filter_out_buf) * s->lsps);
2014         memset(s->dcf_mem,              0,
2015                sizeof(*s->dcf_mem)              * 2);
2016         memset(s->zero_exc_pf,          0,
2017                sizeof(*s->zero_exc_pf)          * s->history_nsamples);
2018         memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2019     }
2020 }
2021
2022 AVCodec ff_wmavoice_decoder = {
2023     "wmavoice",
2024     AVMEDIA_TYPE_AUDIO,
2025     CODEC_ID_WMAVOICE,
2026     sizeof(WMAVoiceContext),
2027     wmavoice_decode_init,
2028     NULL,
2029     wmavoice_decode_end,
2030     wmavoice_decode_packet,
2031     CODEC_CAP_SUBFRAMES,
2032     .flush     = wmavoice_flush,
2033     .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2034 };