diff --git a/webrtc/modules/audio_processing/audio_processing_impl.cc b/webrtc/modules/audio_processing/audio_processing_impl.cc index e155171611..bb746ee635 100644 --- a/webrtc/modules/audio_processing/audio_processing_impl.cc +++ b/webrtc/modules/audio_processing/audio_processing_impl.cc @@ -1184,8 +1184,7 @@ bool AudioProcessingImpl::analysis_needed(bool is_data_processed) const { } bool AudioProcessingImpl::is_rev_processed() const { - return constants_.intelligibility_enabled && - public_submodules_->intelligibility_enhancer->active(); + return constants_.intelligibility_enabled; } bool AudioProcessingImpl::render_check_rev_conversion_needed() const { @@ -1236,12 +1235,9 @@ void AudioProcessingImpl::InitializeBeamformer() { void AudioProcessingImpl::InitializeIntelligibility() { if (constants_.intelligibility_enabled) { - IntelligibilityEnhancer::Config config; - config.sample_rate_hz = capture_nonlocked_.split_rate; - config.num_capture_channels = capture_.capture_audio->num_channels(); - config.num_render_channels = render_.render_audio->num_channels(); public_submodules_->intelligibility_enhancer.reset( - new IntelligibilityEnhancer(config)); + new IntelligibilityEnhancer(capture_nonlocked_.split_rate, + render_.render_audio->num_channels())); } } diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc index f0050a2ae1..8f0e7bf6b9 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc @@ -27,11 +27,16 @@ namespace { const size_t kErbResolution = 2; const int kWindowSizeMs = 16; const int kChunkSizeMs = 10; // Size provided by APM. -const float kClipFreq = 200.0f; -const float kConfigRho = 0.02f; // Default production and interpretation SNR. +const float kClipFreqKhz = 0.2f; const float kKbdAlpha = 1.5f; const float kLambdaBot = -1.0f; // Extreme values in bisection const float kLambdaTop = -10e-18f; // search for lamda. +const float kVoiceProbabilityThreshold = 0.02f; +// Number of chunks after voice activity which is still considered speech. +const size_t kSpeechOffsetDelay = 80; +const float kDecayRate = 0.98f; // Power estimation decay rate. +const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain. +const float kRho = 0.0004f; // Default production and interpretation SNR. // Returns dot product of vectors |a| and |b| with size |length|. float DotProduct(const float* a, const float* b, size_t length) { @@ -72,61 +77,46 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( } } -IntelligibilityEnhancer::IntelligibilityEnhancer() - : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) { -} - -IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config) +IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, + size_t num_render_channels) : freqs_(RealFourier::ComplexLength( - RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))), - window_size_(static_cast(1 << RealFourier::FftOrder(freqs_))), - chunk_length_( - static_cast(config.sample_rate_hz * kChunkSizeMs / 1000)), - bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)), - sample_rate_hz_(config.sample_rate_hz), - erb_resolution_(kErbResolution), - num_capture_channels_(config.num_capture_channels), - num_render_channels_(config.num_render_channels), - analysis_rate_(config.analysis_rate), - active_(true), - clear_power_(freqs_, config.decay_rate), - noise_power_(freqs_, 0.f), + RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), + chunk_length_(static_cast(sample_rate_hz * kChunkSizeMs / 1000)), + bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), + sample_rate_hz_(sample_rate_hz), + num_render_channels_(num_render_channels), + clear_power_estimator_(freqs_, kDecayRate), + noise_power_estimator_( + new intelligibility::PowerEstimator(freqs_, kDecayRate)), filtered_clear_pow_(new float[bank_size_]), filtered_noise_pow_(new float[bank_size_]), center_freqs_(new float[bank_size_]), render_filter_bank_(CreateErbBank(freqs_)), - rho_(new float[bank_size_]), gains_eq_(new float[bank_size_]), - gain_applier_(freqs_, config.gain_change_limit), + gain_applier_(freqs_, kMaxRelativeGainChange), temp_render_out_buffer_(chunk_length_, num_render_channels_), - kbd_window_(new float[window_size_]), render_callback_(this), - block_count_(0), - analysis_step_(0) { - RTC_DCHECK_LE(config.rho, 1.0f); + audio_s16_(chunk_length_), + chunks_since_voice_(kSpeechOffsetDelay), + is_speech_(false) { + RTC_DCHECK_LE(kRho, 1.f); - memset(filtered_clear_pow_.get(), - 0, + memset(filtered_clear_pow_.get(), 0, bank_size_ * sizeof(filtered_clear_pow_[0])); - memset(filtered_noise_pow_.get(), - 0, + memset(filtered_noise_pow_.get(), 0, bank_size_ * sizeof(filtered_noise_pow_[0])); - // Assumes all rho equal. - for (size_t i = 0; i < bank_size_; ++i) { - rho_[i] = config.rho * config.rho; - } + const size_t erb_index = static_cast( + ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + + 43.f)); + start_freq_ = std::max(static_cast(1), erb_index * kErbResolution); - float freqs_khz = kClipFreq / 1000.0f; - size_t erb_index = static_cast(ceilf( - 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f)); - start_freq_ = std::max(static_cast(1), erb_index * erb_resolution_); - - WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_, - kbd_window_.get()); + size_t window_size = static_cast(1 << RealFourier::FftOrder(freqs_)); + std::vector kbd_window(window_size); + WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); render_mangler_.reset(new LappedTransform( - num_render_channels_, num_render_channels_, chunk_length_, - kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_)); + num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], + window_size, window_size / 2, &render_callback_)); } void IntelligibilityEnhancer::SetCaptureNoiseEstimate( @@ -134,13 +124,10 @@ void IntelligibilityEnhancer::SetCaptureNoiseEstimate( if (capture_filter_bank_.size() != bank_size_ || capture_filter_bank_[0].size() != noise.size()) { capture_filter_bank_ = CreateErbBank(noise.size()); + noise_power_estimator_.reset( + new intelligibility::PowerEstimator(noise.size(), kDecayRate)); } - if (noise.size() != noise_power_.size()) { - noise_power_.resize(noise.size()); - } - for (size_t i = 0; i < noise.size(); ++i) { - noise_power_[i] = noise[i] * noise[i]; - } + noise_power_estimator_->Step(&noise[0]); } void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, @@ -148,54 +135,29 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, size_t num_channels) { RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); RTC_CHECK_EQ(num_render_channels_, num_channels); - - if (active_) { - render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); - } - - if (active_) { - for (size_t i = 0; i < num_render_channels_; ++i) { - memcpy(audio[i], temp_render_out_buffer_.channels()[i], - chunk_length_ * sizeof(**audio)); - } + is_speech_ = IsSpeech(audio[0]); + render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); + for (size_t i = 0; i < num_render_channels_; ++i) { + memcpy(audio[i], temp_render_out_buffer_.channels()[i], + chunk_length_ * sizeof(**audio)); } } void IntelligibilityEnhancer::ProcessClearBlock( const std::complex* in_block, std::complex* out_block) { - if (block_count_ < 2) { - memset(out_block, 0, freqs_ * sizeof(*out_block)); - ++block_count_; - return; + if (is_speech_) { + clear_power_estimator_.Step(in_block); } - - // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary. - if (true) { - clear_power_.Step(in_block); - if (block_count_ % analysis_rate_ == analysis_rate_ - 1) { - AnalyzeClearBlock(); - ++analysis_step_; - } - ++block_count_; - } - - if (active_) { - gain_applier_.Apply(in_block, out_block); - } -} - -void IntelligibilityEnhancer::AnalyzeClearBlock() { - const float* clear_power = clear_power_.Power(); - MapToErbBands(clear_power, - render_filter_bank_, + const std::vector& clear_power = clear_power_estimator_.power(); + const std::vector& noise_power = noise_power_estimator_->power(); + MapToErbBands(&clear_power[0], render_filter_bank_, filtered_clear_pow_.get()); - MapToErbBands(&noise_power_[0], - capture_filter_bank_, + MapToErbBands(&noise_power[0], capture_filter_bank_, filtered_noise_pow_.get()); SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); - const float power_target = std::accumulate( - clear_power, clear_power + freqs_, 0.f); + const float power_target = + std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); const float power_top = DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); @@ -205,6 +167,7 @@ void IntelligibilityEnhancer::AnalyzeClearBlock() { SolveForLambda(power_target, power_bot, power_top); UpdateErbGains(); } // Else experiencing power underflow, so do nothing. + gain_applier_.Apply(in_block, out_block); } void IntelligibilityEnhancer::SolveForLambda(float power_target, @@ -217,11 +180,10 @@ void IntelligibilityEnhancer::SolveForLambda(float power_target, 1.f / (power_target + std::numeric_limits::epsilon()); float lambda_bot = kLambdaBot; float lambda_top = kLambdaTop; - float power_ratio = 2.0f; // Ratio of achieved power to target power. + float power_ratio = 2.f; // Ratio of achieved power to target power. int iters = 0; - while (std::fabs(power_ratio - 1.0f) > kConvergeThresh && - iters <= kMaxIters) { - const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f; + while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { + const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f; SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); const float power = DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); @@ -239,7 +201,7 @@ void IntelligibilityEnhancer::UpdateErbGains() { // (ERB gain) = filterbank' * (freq gain) float* gains = gain_applier_.target(); for (size_t i = 0; i < freqs_; ++i) { - gains[i] = 0.0f; + gains[i] = 0.f; for (size_t j = 0; j < bank_size_; ++j) { gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]); } @@ -248,9 +210,9 @@ void IntelligibilityEnhancer::UpdateErbGains() { size_t IntelligibilityEnhancer::GetBankSize(int sample_rate, size_t erb_resolution) { - float freq_limit = sample_rate / 2000.0f; + float freq_limit = sample_rate / 2000.f; size_t erb_scale = static_cast(ceilf( - 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f)); + 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f)); return erb_scale * erb_resolution; } @@ -260,7 +222,7 @@ std::vector> IntelligibilityEnhancer::CreateErbBank( size_t lf = 1, rf = 4; for (size_t i = 0; i < bank_size_; ++i) { - float abs_temp = fabsf((i + 1.0f) / static_cast(erb_resolution_)); + float abs_temp = fabsf((i + 1.f) / static_cast(kErbResolution)); center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); center_freqs_[i] -= 14678.49f; } @@ -274,48 +236,43 @@ std::vector> IntelligibilityEnhancer::CreateErbBank( } for (size_t i = 1; i <= bank_size_; ++i) { - size_t lll, ll, rr, rrr; static const size_t kOne = 1; // Avoids repeated static_cast<>s below. - lll = static_cast(round( - center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs / - (0.5f * sample_rate_hz_))); - ll = static_cast(round( - center_freqs_[std::max(kOne, i) - 1] * num_freqs / - (0.5f * sample_rate_hz_))); + size_t lll = + static_cast(round(center_freqs_[std::max(kOne, i - lf) - 1] * + num_freqs / (0.5f * sample_rate_hz_))); + size_t ll = static_cast(round(center_freqs_[std::max(kOne, i) - 1] * + num_freqs / (0.5f * sample_rate_hz_))); lll = std::min(num_freqs, std::max(lll, kOne)) - 1; ll = std::min(num_freqs, std::max(ll, kOne)) - 1; - rrr = static_cast(round( - center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs / - (0.5f * sample_rate_hz_))); - rr = static_cast(round( - center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs / - (0.5f * sample_rate_hz_))); + size_t rrr = static_cast( + round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs / + (0.5f * sample_rate_hz_))); + size_t rr = static_cast( + round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs / + (0.5f * sample_rate_hz_))); rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1; rr = std::min(num_freqs, std::max(rr, kOne)) - 1; - float step, element; - - step = ll == lll ? 0.f : 1.f / (ll - lll); - element = 0.0f; + float step = ll == lll ? 0.f : 1.f / (ll - lll); + float element = 0.f; for (size_t j = lll; j <= ll; ++j) { filter_bank[i - 1][j] = element; element += step; } step = rr == rrr ? 0.f : 1.f / (rrr - rr); - element = 1.0f; + element = 1.f; for (size_t j = rr; j <= rrr; ++j) { filter_bank[i - 1][j] = element; element -= step; } for (size_t j = ll; j <= rr; ++j) { - filter_bank[i - 1][j] = 1.0f; + filter_bank[i - 1][j] = 1.f; } } - float sum; for (size_t i = 0; i < num_freqs; ++i) { - sum = 0.0f; + float sum = 0.f; for (size_t j = 0; j < bank_size_; ++j) { sum += filter_bank[j][i]; } @@ -329,22 +286,22 @@ std::vector> IntelligibilityEnhancer::CreateErbBank( void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols) { - bool quadratic = (kConfigRho < 1.0f); + bool quadratic = (kRho < 1.f); const float* pow_x0 = filtered_clear_pow_.get(); const float* pow_n0 = filtered_noise_pow_.get(); for (size_t n = 0; n < start_freq; ++n) { - sols[n] = 1.0f; + sols[n] = 1.f; } // Analytic solution for optimal gains. See paper for derivation. for (size_t n = start_freq - 1; n < bank_size_; ++n) { float alpha0, beta0, gamma0; - gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] + + gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] + lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; - beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n]; + beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n]; if (quadratic) { - alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n]; + alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n]; sols[n] = (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) / (2 * alpha0 + std::numeric_limits::epsilon()); @@ -355,8 +312,15 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, } } -bool IntelligibilityEnhancer::active() const { - return active_; +bool IntelligibilityEnhancer::IsSpeech(const float* audio) { + FloatToS16(audio, chunk_length_, &audio_s16_[0]); + vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); + if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { + chunks_since_voice_ = 0; + } else if (chunks_since_voice_ < kSpeechOffsetDelay) { + ++chunks_since_voice_; + } + return chunks_since_voice_ < kSpeechOffsetDelay; } } // namespace webrtc diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h index 2deb4d2439..c18bac0d85 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h @@ -18,6 +18,7 @@ #include "webrtc/common_audio/lapped_transform.h" #include "webrtc/common_audio/channel_buffer.h" #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" namespace webrtc { @@ -28,28 +29,7 @@ namespace webrtc { // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788 class IntelligibilityEnhancer { public: - struct Config { - // TODO(bercic): the |decay_rate|, |analysis_rate| and |gain_limit| - // parameters should probably go away once fine tuning is done. - Config() - : sample_rate_hz(16000), - num_capture_channels(1), - num_render_channels(1), - decay_rate(0.9f), - analysis_rate(60), - gain_change_limit(0.1f), - rho(0.02f) {} - int sample_rate_hz; - size_t num_capture_channels; - size_t num_render_channels; - float decay_rate; - int analysis_rate; - float gain_change_limit; - float rho; - }; - - explicit IntelligibilityEnhancer(const Config& config); - IntelligibilityEnhancer(); // Initialize with default config. + IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels); // Sets the capture noise magnitude spectrum estimate. void SetCaptureNoiseEstimate(std::vector noise); @@ -86,9 +66,6 @@ class IntelligibilityEnhancer { void ProcessClearBlock(const std::complex* in_block, std::complex* out_block); - // Computes and sets modified gains. - void AnalyzeClearBlock(); - // Bisection search for optimal |lambda|. void SolveForLambda(float power_target, float power_bot, float power_top); @@ -105,29 +82,25 @@ class IntelligibilityEnhancer { // Negative gains are set to 0. Stores the results in |sols|. void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols); + // Returns true if the audio is speech. + bool IsSpeech(const float* audio); + const size_t freqs_; // Num frequencies in frequency domain. - const size_t window_size_; // Window size in samples; also the block size. const size_t chunk_length_; // Chunk size in samples. const size_t bank_size_; // Num ERB filters. const int sample_rate_hz_; - const int erb_resolution_; - const size_t num_capture_channels_; const size_t num_render_channels_; - const int analysis_rate_; // Num blocks before gains recalculated. - const bool active_; // Whether render gains are being updated. - // TODO(ekm): Add logic for updating |active_|. - - intelligibility::PowerEstimator clear_power_; - std::vector noise_power_; + intelligibility::PowerEstimator> clear_power_estimator_; + std::unique_ptr> + noise_power_estimator_; std::unique_ptr filtered_clear_pow_; std::unique_ptr filtered_noise_pow_; std::unique_ptr center_freqs_; std::vector> capture_filter_bank_; std::vector> render_filter_bank_; size_t start_freq_; - std::unique_ptr rho_; // Production and interpretation SNR. - // for each ERB band. + std::unique_ptr gains_eq_; // Pre-filter modified gains. intelligibility::GainApplier gain_applier_; @@ -135,11 +108,13 @@ class IntelligibilityEnhancer { // the original input array with modifications. ChannelBuffer temp_render_out_buffer_; - std::unique_ptr kbd_window_; TransformCallback render_callback_; std::unique_ptr render_mangler_; - int block_count_; - int analysis_step_; + + VoiceActivityDetector vad_; + std::vector audio_s16_; + size_t chunks_since_voice_; + bool is_speech_; }; } // namespace webrtc diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc index b0f94ec5e2..b59ae36d8b 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc @@ -26,54 +26,184 @@ namespace { // Target output for ERB create test. Generated with matlab. const float kTestCenterFreqs[] = { - 13.169f, 26.965f, 41.423f, 56.577f, 72.461f, 89.113f, 106.57f, 124.88f, - 144.08f, 164.21f, 185.34f, 207.5f, 230.75f, 255.16f, 280.77f, 307.66f, - 335.9f, 365.56f, 396.71f, 429.44f, 463.84f, 500.f}; -const float kTestFilterBank[][9] = { - {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.5f}, - {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.5f}}; + 14.5213f, 29.735f, 45.6781f, 62.3884f, 79.9058f, 98.2691f, 117.521f, + 137.708f, 158.879f, 181.084f, 204.378f, 228.816f, 254.459f, 281.371f, + 309.618f, 339.273f, 370.411f, 403.115f, 437.469f, 473.564f, 511.497f, + 551.371f, 593.293f, 637.386f, 683.77f, 732.581f, 783.96f, 838.06f, + 895.046f, 955.09f, 1018.38f, 1085.13f, 1155.54f, 1229.85f, 1308.32f, + 1391.22f, 1478.83f, 1571.5f, 1669.55f, 1773.37f, 1883.37f, 2000.f}; +const float kTestFilterBank[][33] = { + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.157895f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.210526f, 0.117647f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.315789f, 0.176471f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.315789f, 0.352941f, 0.142857f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.352941f, 0.285714f, + 0.157895f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, + 0.210526f, 0.111111f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.285714f, 0.315789f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.315789f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, + 0.108108f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, + 0.243243f, 0.153846f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, + 0.324324f, 0.230769f, 0.166667f, 0.0909091f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.324324f, 0.307692f, 0.25f, 0.181818f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.307692f, 0.333333f, + 0.363636f, 0.25f, 0.151515f, 0.0793651f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.166667f, 0.363636f, 0.333333f, 0.242424f, + 0.190476f, 0.133333f, 0.0689655f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.30303f, 0.253968f, 0.2f, 0.137931f, + 0.0714286f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.30303f, 0.31746f, 0.333333f, 0.275862f, 0.214286f, + 0.125f, 0.0655738f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.15873f, 0.333333f, 0.344828f, 0.357143f, + 0.25f, 0.196721f, 0.137931f, 0.0816327f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.172414f, 0.357143f, + 0.3125f, 0.245902f, 0.172414f, 0.102041f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.3125f, 0.327869f, 0.344828f, 0.204082f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.163934f, 0.344828f, 0.408163f, 0.5f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.204082f, 0.5f}}; static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestFilterBank), "Test filterbank badly initialized."); // Target output for gain solving test. Generated with matlab. const size_t kTestStartFreq = 12; // Lowest integral frequency for ERBs. -const float kTestZeroVar[] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, - 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; +const float kTestZeroVar[] = { + 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0}; static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestZeroVar), "Power test data badly initialized."); const float kTestNonZeroVarLambdaTop[] = { - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, - 1.f, 1.f, 1.f, 0.f, 0.f, 0.0351f, 0.0636f, 0.0863f, - 0.1037f, 0.1162f, 0.1236f, 0.1251f, 0.1189f, 0.0993f}; + 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0}; static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestNonZeroVarLambdaTop), "Power test data badly initialized."); const float kMaxTestError = 0.005f; // Enhancer initialization parameters. -const int kSamples = 2000; -const int kSampleRate = 1000; +const int kSamples = 1000; +const int kSampleRate = 4000; const int kNumChannels = 1; const int kFragmentSize = kSampleRate / 100; @@ -83,13 +213,11 @@ class IntelligibilityEnhancerTest : public ::testing::Test { protected: IntelligibilityEnhancerTest() : clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) { - config_.sample_rate_hz = kSampleRate; - enh_.reset(new IntelligibilityEnhancer(config_)); + enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels)); } bool CheckUpdate() { - config_.sample_rate_hz = kSampleRate; - enh_.reset(new IntelligibilityEnhancer(config_)); + enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels)); float* clear_cursor = &clear_data_[0]; float* noise_cursor = &noise_data_[0]; for (int i = 0; i < kSamples; i += kFragmentSize) { @@ -105,7 +233,6 @@ class IntelligibilityEnhancerTest : public ::testing::Test { return false; } - IntelligibilityEnhancer::Config config_; std::unique_ptr enh_; std::vector clear_data_; std::vector noise_data_; @@ -115,9 +242,9 @@ class IntelligibilityEnhancerTest : public ::testing::Test { // For each class of generated data, tests that render stream is updated when // it should be. TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) { - std::fill(noise_data_.begin(), noise_data_.end(), 0.0f); - std::fill(orig_data_.begin(), orig_data_.end(), 0.0f); - std::fill(clear_data_.begin(), clear_data_.end(), 0.0f); + std::fill(noise_data_.begin(), noise_data_.end(), 0.f); + std::fill(orig_data_.begin(), orig_data_.end(), 0.f); + std::fill(clear_data_.begin(), clear_data_.end(), 0.f); EXPECT_FALSE(CheckUpdate()); std::srand(1); auto float_rand = []() { return std::rand() * 2.f / RAND_MAX - 1; }; @@ -148,9 +275,8 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) { std::vector sols(enh_->bank_size_); float lambda = -0.001f; for (size_t i = 0; i < enh_->bank_size_; i++) { - enh_->filtered_clear_pow_[i] = 0.0f; - enh_->filtered_noise_pow_[i] = 0.0f; - enh_->rho_[i] = 0.02f; + enh_->filtered_clear_pow_[i] = 0.f; + enh_->filtered_noise_pow_[i] = 0.f; } enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]); for (size_t i = 0; i < enh_->bank_size_; i++) { @@ -164,7 +290,7 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) { for (size_t i = 0; i < enh_->bank_size_; i++) { EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError); } - lambda = -1.0; + lambda = -1.f; enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]); for (size_t i = 0; i < enh_->bank_size_; i++) { EXPECT_NEAR(kTestZeroVar[i], sols[i], kMaxTestError); diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc index 6c44415130..6d37199a2c 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc @@ -14,6 +14,7 @@ #include #include #include +#include namespace webrtc { @@ -21,45 +22,38 @@ namespace intelligibility { namespace { -// Return |current| changed towards |target|, with the change being at most -// |limit|. +// Return |current| changed towards |target|, with the relative change being at +// most |limit|. float UpdateFactor(float target, float current, float limit) { - float delta = fabsf(target - current); - float sign = copysign(1.f, target - current); - return current + sign * fminf(delta, limit); + float gain = target / (current + std::numeric_limits::epsilon()); + if (gain < 1.f - limit) { + gain = 1.f - limit; + } else if (gain > 1.f + limit) { + gain = 1.f + limit; + } + return current * gain + std::numeric_limits::epsilon(); } } // namespace -PowerEstimator::PowerEstimator(size_t num_freqs, - float decay) - : magnitude_(new float[num_freqs]()), - power_(new float[num_freqs]()), - num_freqs_(num_freqs), - decay_(decay) { - memset(magnitude_.get(), 0, sizeof(*magnitude_.get()) * num_freqs_); - memset(power_.get(), 0, sizeof(*power_.get()) * num_freqs_); -} +template +PowerEstimator::PowerEstimator(size_t num_freqs, float decay) + : power_(num_freqs, 0.f), decay_(decay) {} -// Compute the magnitude from the beginning, with exponential decaying of the -// series data. -void PowerEstimator::Step(const std::complex* data) { - for (size_t i = 0; i < num_freqs_; ++i) { - magnitude_[i] = decay_ * magnitude_[i] + - (1.f - decay_) * std::abs(data[i]); +template +void PowerEstimator::Step(const T* data) { + for (size_t i = 0; i < power_.size(); ++i) { + power_[i] = decay_ * power_[i] + + (1.f - decay_) * std::abs(data[i]) * std::abs(data[i]); } } -const float* PowerEstimator::Power() { - for (size_t i = 0; i < num_freqs_; ++i) { - power_[i] = magnitude_[i] * magnitude_[i]; - } - return &power_[0]; -} +template class PowerEstimator; +template class PowerEstimator>; -GainApplier::GainApplier(size_t freqs, float change_limit) +GainApplier::GainApplier(size_t freqs, float relative_change_limit) : num_freqs_(freqs), - change_limit_(change_limit), + relative_change_limit_(relative_change_limit), target_(new float[freqs]()), current_(new float[freqs]()) { for (size_t i = 0; i < freqs; ++i) { @@ -71,12 +65,8 @@ GainApplier::GainApplier(size_t freqs, float change_limit) void GainApplier::Apply(const std::complex* in_block, std::complex* out_block) { for (size_t i = 0; i < num_freqs_; ++i) { - float factor = sqrtf(fabsf(current_[i])); - if (!std::isnormal(factor)) { - factor = 1.f; - } - out_block[i] = factor * in_block[i]; - current_[i] = UpdateFactor(target_[i], current_[i], change_limit_); + current_[i] = UpdateFactor(target_[i], current_[i], relative_change_limit_); + out_block[i] = sqrtf(fabsf(current_[i])) * in_block[i]; } } diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h index 8858cff74c..3805a0cd15 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h @@ -13,6 +13,7 @@ #include #include +#include namespace webrtc { @@ -21,6 +22,7 @@ namespace intelligibility { // Internal helper for computing the power of a stream of arrays. // The result is an array of power per position: the i-th power is the power of // the stream of data on the i-th positions in the input arrays. +template class PowerEstimator { public: // Construct an instance for the given input array length (|freqs|), with the @@ -28,31 +30,24 @@ class PowerEstimator { PowerEstimator(size_t freqs, float decay); // Add a new data point to the series. - void Step(const std::complex* data); + void Step(const T* data); // The current power array. - const float* Power(); + const std::vector& power() { return power_; }; private: - // TODO(ekmeyerson): Switch the following running means - // and histories from std::unique_ptr to std::vector. - std::unique_ptr[]> running_mean_sq_; - - // The current magnitude array. - std::unique_ptr magnitude_; // The current power array. - std::unique_ptr power_; + std::vector power_; - const size_t num_freqs_; const float decay_; }; // Helper class for smoothing gain changes. On each application step, the // currently used gains are changed towards a set of settable target gains, -// constrained by a limit on the magnitude of the changes. +// constrained by a limit on the relative changes. class GainApplier { public: - GainApplier(size_t freqs, float change_limit); + GainApplier(size_t freqs, float relative_change_limit); // Copy |in_block| to |out_block|, multiplied by the current set of gains, // and step the current set of gains towards the target set. @@ -64,7 +59,7 @@ class GainApplier { private: const size_t num_freqs_; - const float change_limit_; + const float relative_change_limit_; std::unique_ptr target_; std::unique_ptr current_; }; diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc index 43ad9a7b1a..28957bb80d 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc @@ -39,17 +39,16 @@ TEST(IntelligibilityUtilsTest, TestPowerEstimator) { const float kDecay = 0.5f; const std::vector>> test_data( GenerateTestData(kFreqs, kSamples)); - PowerEstimator power_estimator(kFreqs, kDecay); - EXPECT_EQ(0, power_estimator.Power()[0]); + PowerEstimator> power_estimator(kFreqs, kDecay); + EXPECT_EQ(0, power_estimator.power()[0]); // Makes sure Step is doing something. power_estimator.Step(&test_data[0][0]); for (size_t i = 1; i < kSamples; ++i) { power_estimator.Step(&test_data[i][0]); for (size_t j = 0; j < kFreqs; ++j) { - const float* power = power_estimator.Power(); - EXPECT_GE(power[j], 0.f); - EXPECT_LE(power[j], 1.f); + EXPECT_GE(power_estimator.power()[j], 0.f); + EXPECT_LE(power_estimator.power()[j], 1.f); } } } @@ -62,8 +61,8 @@ TEST(IntelligibilityUtilsTest, TestGainApplier) { GainApplier gain_applier(kFreqs, kChangeLimit); const std::vector>> in_data( GenerateTestData(kFreqs, kSamples)); - std::vector>> out_data(GenerateTestData( - kFreqs, kSamples)); + std::vector>> out_data( + GenerateTestData(kFreqs, kSamples)); for (size_t i = 0; i < kSamples; ++i) { gain_applier.Apply(&in_data[i][0], &out_data[i][0]); for (size_t j = 0; j < kFreqs; ++j) { diff --git a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc index 1ec85f0abb..ab8524bb00 100644 --- a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc +++ b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc @@ -30,44 +30,24 @@ using std::complex; namespace webrtc { namespace { -DEFINE_double(clear_alpha, 0.9, "Power decay factor for clear data."); -DEFINE_int32(sample_rate, - 16000, - "Audio sample rate used in the input and output files."); -DEFINE_int32(ana_rate, - 60, - "Analysis rate; gains recalculated every N blocks."); -DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block."); - DEFINE_string(clear_file, "speech.wav", "Input file with clear speech."); DEFINE_string(noise_file, "noise.wav", "Input file with noise data."); DEFINE_string(out_file, "proc_enhanced.wav", "Enhanced output file."); -const size_t kNumChannels = 1; - // void function for gtest void void_main(int argc, char* argv[]) { google::SetUsageMessage( "\n\nInput files must be little-endian 16-bit signed raw PCM.\n"); google::ParseCommandLineFlags(&argc, &argv, true); - size_t samples; // Number of samples in input PCM file - size_t fragment_size; // Number of samples to process at a time - // to simulate APM stream processing - // Load settings and wav input. - - fragment_size = FLAGS_sample_rate / 100; // Mirror real time APM chunk size. - // Duplicates chunk_length_ in - // IntelligibilityEnhancer. - struct stat in_stat, noise_stat; ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0) << "Empty speech file."; ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0) << "Empty noise file."; - samples = std::min(in_stat.st_size, noise_stat.st_size) / 2; + const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2; WavReader in_file(FLAGS_clear_file); std::vector in_fpcm(samples); @@ -80,23 +60,19 @@ void void_main(int argc, char* argv[]) { FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]); // Run intelligibility enhancement. - IntelligibilityEnhancer::Config config; - config.sample_rate_hz = FLAGS_sample_rate; - config.decay_rate = static_cast(FLAGS_clear_alpha); - config.analysis_rate = FLAGS_ana_rate; - config.gain_change_limit = FLAGS_gain_limit; - IntelligibilityEnhancer enh(config); + IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels()); rtc::CriticalSection crit; NoiseSuppressionImpl ns(&crit); - ns.Initialize(kNumChannels, FLAGS_sample_rate); + ns.Initialize(noise_file.num_channels(), noise_file.sample_rate()); ns.Enable(true); - AudioBuffer capture_audio(fragment_size, - kNumChannels, - fragment_size, - kNumChannels, + // Mirror real time APM chunk size. Duplicates chunk_length_ in + // IntelligibilityEnhancer. + size_t fragment_size = in_file.sample_rate() / 100; + AudioBuffer capture_audio(fragment_size, noise_file.num_channels(), + fragment_size, noise_file.num_channels(), fragment_size); - StreamConfig stream_config(FLAGS_sample_rate, kNumChannels); + StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels()); // Slice the input into smaller chunks, as the APM would do, and feed them // through the enhancer. @@ -108,14 +84,17 @@ void void_main(int argc, char* argv[]) { ns.AnalyzeCaptureAudio(&capture_audio); ns.ProcessCaptureAudio(&capture_audio); enh.SetCaptureNoiseEstimate(ns.NoiseEstimate()); - enh.ProcessRenderAudio(&clear_cursor, FLAGS_sample_rate, kNumChannels); + enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(), + in_file.num_channels()); clear_cursor += fragment_size; noise_cursor += fragment_size; } FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]); - WavWriter out_file(FLAGS_out_file, FLAGS_sample_rate, kNumChannels); + WavWriter out_file(FLAGS_out_file, + in_file.sample_rate(), + in_file.num_channels()); out_file.WriteSamples(&in_fpcm[0], samples); } diff --git a/webrtc/modules/audio_processing/noise_suppression_impl.cc b/webrtc/modules/audio_processing/noise_suppression_impl.cc index 076f1ba25a..7f19005924 100644 --- a/webrtc/modules/audio_processing/noise_suppression_impl.cc +++ b/webrtc/modules/audio_processing/noise_suppression_impl.cc @@ -182,8 +182,8 @@ std::vector NoiseSuppressionImpl::NoiseEstimate() { for (auto& suppressor : suppressors_) { const float* noise = WebRtcNs_noise_estimate(suppressor->state()); for (size_t i = 0; i < noise_estimate.size(); ++i) { - noise_estimate[i] += kNormalizationFactor * - noise[i] / suppressors_.size(); + noise_estimate[i] += + kNormalizationFactor * noise[i] / suppressors_.size(); } } #elif defined(WEBRTC_NS_FIXED)