diff --git a/api/audio/echo_canceller3_config.cc b/api/audio/echo_canceller3_config.cc index e955b2fd95..3b03d13c1c 100644 --- a/api/audio/echo_canceller3_config.cc +++ b/api/audio/echo_canceller3_config.cc @@ -25,4 +25,30 @@ EchoCanceller3Config::EchoModel::EchoModel() = default; EchoCanceller3Config::EchoModel::EchoModel( const EchoCanceller3Config::EchoModel& e) = default; +EchoCanceller3Config::Suppressor::Suppressor() = default; +EchoCanceller3Config::Suppressor::Suppressor( + const EchoCanceller3Config::Suppressor& e) = default; + +EchoCanceller3Config::Suppressor::MaskingThresholds::MaskingThresholds( + float enr_transparent, + float enr_suppress, + float emr_transparent) + : enr_transparent(enr_transparent), + enr_suppress(enr_suppress), + emr_transparent(emr_transparent) {} +EchoCanceller3Config::Suppressor::Suppressor::MaskingThresholds:: + MaskingThresholds( + const EchoCanceller3Config::Suppressor::MaskingThresholds& e) = default; + +EchoCanceller3Config::Suppressor::Tuning::Tuning(MaskingThresholds mask_lf, + MaskingThresholds mask_hf, + float max_inc_factor, + float max_dec_factor_lf) + : mask_lf(mask_lf), + mask_hf(mask_hf), + max_inc_factor(max_inc_factor), + max_dec_factor_lf(max_dec_factor_lf) {} +EchoCanceller3Config::Suppressor::Tuning::Tuning( + const EchoCanceller3Config::Suppressor::Tuning& e) = default; + } // namespace webrtc diff --git a/api/audio/echo_canceller3_config.h b/api/audio/echo_canceller3_config.h index 79d87a08f8..9a1510b138 100644 --- a/api/audio/echo_canceller3_config.h +++ b/api/audio/echo_canceller3_config.h @@ -112,12 +112,6 @@ struct EchoCanceller3Config { float poor_excitation_render_limit_ds8 = 20.f; } render_levels; - struct GainUpdates { - float max_inc_factor = 2.0f; - float max_dec_factor_lf = 0.25f; - float floor_first_increase = 0.00001f; - } gain_updates; - struct EchoRemovalControl { struct GainRampup { float initial_gain = 0.0f; @@ -146,15 +140,50 @@ struct EchoCanceller3Config { } echo_model; struct Suppressor { + Suppressor(); + Suppressor(const Suppressor& e); + size_t nearend_average_blocks = 4; struct MaskingThresholds { + MaskingThresholds(float enr_transparent, + float enr_suppress, + float emr_transparent); + MaskingThresholds(const MaskingThresholds& e); float enr_transparent; float enr_suppress; float emr_transparent; }; - MaskingThresholds mask_lf = {.2f, .3f, .3f}; - MaskingThresholds mask_hf = {.07f, .1f, .3f}; + + struct Tuning { + Tuning(MaskingThresholds mask_lf, + MaskingThresholds mask_hf, + float max_inc_factor, + float max_dec_factor_lf); + Tuning(const Tuning& e); + MaskingThresholds mask_lf; + MaskingThresholds mask_hf; + float max_inc_factor; + float max_dec_factor_lf; + }; + + Tuning normal_tuning = Tuning(MaskingThresholds(.2f, .3f, .3f), + MaskingThresholds(.07f, .1f, .3f), + 2.0f, + 0.25f); + Tuning nearend_tuning = Tuning(MaskingThresholds(.2f, .3f, .3f), + MaskingThresholds(.07f, .1f, .3f), + 2.0f, + 0.25f); + + struct DominantNearendDetection { + float enr_threshold = 10.f; + float snr_threshold = 10.f; + int hold_duration = 25; + int trigger_threshold = 15; + } dominant_nearend_detection; + + float floor_first_increase = 0.00001f; bool enforce_transparent = false; bool enforce_empty_higher_bands = false; } suppressor; diff --git a/modules/audio_processing/aec3/suppression_gain.cc b/modules/audio_processing/aec3/suppression_gain.cc index b442e04664..2d88ab2863 100644 --- a/modules/audio_processing/aec3/suppression_gain.cc +++ b/modules/audio_processing/aec3/suppression_gain.cc @@ -1,4 +1,3 @@ - /* * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. * @@ -227,13 +226,16 @@ void SuppressionGain::GainToNoAudibleEcho( const std::array& min_gain, const std::array& max_gain, std::array* gain) const { + const auto& p = dominant_nearend_detector_.IsNearendState() ? nearend_params_ + : normal_params_; for (size_t k = 0; k < gain->size(); ++k) { float enr = echo[k] / (nearend[k] + 1.f); // Echo-to-nearend ratio. float emr = echo[k] / (masker[k] + 1.f); // Echo-to-masker (noise) ratio. float g = 1.0f; - if (enr > enr_transparent_[k] && emr > emr_transparent_[k]) { - g = (enr_suppress_[k] - enr) / (enr_suppress_[k] - enr_transparent_[k]); - g = std::max(g, emr_transparent_[k] / emr); + if (enr > p.enr_transparent_[k] && emr > p.emr_transparent_[k]) { + g = (p.enr_suppress_[k] - enr) / + (p.enr_suppress_[k] - p.enr_transparent_[k]); + g = std::max(g, p.emr_transparent_[k] / emr); } (*gain)[k] = std::max(std::min(g, max_gain[k]), min_gain[k]); } @@ -249,6 +251,9 @@ void SuppressionGain::LowerBandGain( std::array* gain) { const bool saturated_echo = aec_state.SaturatedEcho(); const bool linear_echo_estimate = aec_state.UsableLinearEstimate(); + const auto& params = dominant_nearend_detector_.IsNearendState() + ? nearend_params_ + : normal_params_; // Weight echo power in terms of audibility. // Precompute 1/weighted echo // (note that when the echo is zero, the precomputed value is never used). @@ -273,8 +278,7 @@ void SuppressionGain::LowerBandGain( // quickly after strong nearend. if (last_nearend_[k] > last_echo_[k]) { min_gain[k] = - std::max(min_gain[k], - last_gain_[k] * config_.gain_updates.max_dec_factor_lf); + std::max(min_gain[k], last_gain_[k] * params.max_dec_factor_lf); min_gain[k] = std::min(min_gain[k], 1.f); } } @@ -286,10 +290,9 @@ void SuppressionGain::LowerBandGain( // gain. std::array max_gain; for (size_t k = 0; k < gain->size(); ++k) { - max_gain[k] = - std::min(std::max(last_gain_[k] * config_.gain_updates.max_inc_factor, - config_.gain_updates.floor_first_increase), - 1.f); + max_gain[k] = std::min(std::max(last_gain_[k] * params.max_inc_factor, + config_.suppressor.floor_first_increase), + 1.f); } // Iteratively compute the gain required to attenuate the echo to a non @@ -337,34 +340,16 @@ SuppressionGain::SuppressionGain(const EchoCanceller3Config& config, static_cast(config_.filter.config_change_duration_blocks)), enable_new_suppression_(EnableNewSuppression()), moving_average_(kFftLengthBy2Plus1, - config.suppressor.nearend_average_blocks) { + config.suppressor.nearend_average_blocks), + nearend_params_(config_.suppressor.nearend_tuning), + normal_params_(config_.suppressor.normal_tuning), + dominant_nearend_detector_( + config_.suppressor.dominant_nearend_detection) { RTC_DCHECK_LT(0, state_change_duration_blocks_); one_by_state_change_duration_blocks_ = 1.f / state_change_duration_blocks_; last_gain_.fill(1.f); last_nearend_.fill(0.f); last_echo_.fill(0.f); - - // Compute per-band masking thresholds. - constexpr size_t kLastLfBand = 5; - constexpr size_t kFirstHfBand = 8; - RTC_DCHECK_LT(kLastLfBand, kFirstHfBand); - auto& lf = config.suppressor.mask_lf; - auto& hf = config.suppressor.mask_hf; - RTC_DCHECK_LT(lf.enr_transparent, lf.enr_suppress); - RTC_DCHECK_LT(hf.enr_transparent, hf.enr_suppress); - for (size_t k = 0; k < kFftLengthBy2Plus1; k++) { - float a; - if (k <= kLastLfBand) { - a = 0.f; - } else if (k < kFirstHfBand) { - a = (k - kLastLfBand) / static_cast(kFirstHfBand - kLastLfBand); - } else { - a = 1.f; - } - enr_transparent_[k] = (1 - a) * lf.enr_transparent + a * hf.enr_transparent; - enr_suppress_[k] = (1 - a) * lf.enr_suppress + a * hf.enr_suppress; - emr_transparent_[k] = (1 - a) * lf.emr_transparent + a * hf.emr_transparent; - } } SuppressionGain::~SuppressionGain() = default; @@ -393,6 +378,10 @@ void SuppressionGain::GetGain( std::array nearend_average; moving_average_.Average(nearend_spectrum, nearend_average); + // Update the state selection. + dominant_nearend_detector_.Update(nearend_spectrum, echo_spectrum, + comfort_noise_spectrum); + // Compute gain for the lower band. bool low_noise_render = low_render_detector_.Detect(render); const absl::optional narrow_peak_band = @@ -444,4 +433,69 @@ bool SuppressionGain::LowNoiseRenderDetector::Detect( return low_noise_render; } +SuppressionGain::DominantNearendDetector::DominantNearendDetector( + const EchoCanceller3Config::Suppressor::DominantNearendDetection config) + : enr_threshold_(config.enr_threshold), + snr_threshold_(config.snr_threshold), + hold_duration_(config.hold_duration), + trigger_threshold_(config.trigger_threshold) {} + +void SuppressionGain::DominantNearendDetector::Update( + rtc::ArrayView nearend_spectrum, + rtc::ArrayView echo_spectrum, + rtc::ArrayView comfort_noise_spectrum) { + auto low_frequency_energy = [](rtc::ArrayView spectrum) { + RTC_DCHECK_LE(16, spectrum.size()); + return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f); + }; + const float ne_sum = low_frequency_energy(nearend_spectrum); + const float echo_sum = low_frequency_energy(echo_spectrum); + const float noise_sum = low_frequency_energy(comfort_noise_spectrum); + + // Detect strong active nearend if the nearend is sufficiently stronger than + // the echo and the nearend noise. + if (ne_sum > enr_threshold_ * echo_sum && + ne_sum > snr_threshold_ * noise_sum) { + if (++trigger_counter_ >= trigger_threshold_) { + // After a period of strong active nearend activity, flag nearend mode. + hold_counter_ = hold_duration_; + trigger_counter_ = trigger_threshold_; + } + } else { + // Forget previously detected strong active nearend activity. + trigger_counter_ = std::max(0, trigger_counter_ - 1); + } + + // Remain in any nearend mode for a certain duration. + hold_counter_ = std::max(0, hold_counter_ - 1); + nearend_state_ = hold_counter_ > 0; +} + +SuppressionGain::GainParameters::GainParameters( + const EchoCanceller3Config::Suppressor::Tuning& tuning) + : max_inc_factor(tuning.max_inc_factor), + max_dec_factor_lf(tuning.max_dec_factor_lf) { + // Compute per-band masking thresholds. + constexpr size_t kLastLfBand = 5; + constexpr size_t kFirstHfBand = 8; + RTC_DCHECK_LT(kLastLfBand, kFirstHfBand); + auto& lf = tuning.mask_lf; + auto& hf = tuning.mask_hf; + RTC_DCHECK_LT(lf.enr_transparent, lf.enr_suppress); + RTC_DCHECK_LT(hf.enr_transparent, hf.enr_suppress); + for (size_t k = 0; k < kFftLengthBy2Plus1; k++) { + float a; + if (k <= kLastLfBand) { + a = 0.f; + } else if (k < kFirstHfBand) { + a = (k - kLastLfBand) / static_cast(kFirstHfBand - kLastLfBand); + } else { + a = 1.f; + } + enr_transparent_[k] = (1 - a) * lf.enr_transparent + a * hf.enr_transparent; + enr_suppress_[k] = (1 - a) * lf.enr_suppress + a * hf.enr_suppress; + emr_transparent_[k] = (1 - a) * lf.emr_transparent + a * hf.emr_transparent; + } +} + } // namespace webrtc diff --git a/modules/audio_processing/aec3/suppression_gain.h b/modules/audio_processing/aec3/suppression_gain.h index aa38e547aa..3753711533 100644 --- a/modules/audio_processing/aec3/suppression_gain.h +++ b/modules/audio_processing/aec3/suppression_gain.h @@ -68,6 +68,42 @@ class SuppressionGain { float average_power_ = 32768.f * 32768.f; }; + // Class for selecting whether the suppressor is in the nearend or echo state. + class DominantNearendDetector { + public: + explicit DominantNearendDetector( + const EchoCanceller3Config::Suppressor::DominantNearendDetection + config); + + // Returns whether the current state is the nearend state. + bool IsNearendState() const { return nearend_state_; } + + // Updates the state selection based on latest spectral estimates. + void Update(rtc::ArrayView nearend_spectrum, + rtc::ArrayView echo_spectrum, + rtc::ArrayView comfort_noise_spectrum); + + private: + const float enr_threshold_; + const float snr_threshold_; + const int hold_duration_; + const int trigger_threshold_; + + bool nearend_state_ = false; + int trigger_counter_ = 0; + int hold_counter_ = 0; + }; + + struct GainParameters { + explicit GainParameters( + const EchoCanceller3Config::Suppressor::Tuning& tuning); + const float max_inc_factor; + const float max_dec_factor_lf; + std::array enr_transparent_; + std::array enr_suppress_; + std::array emr_transparent_; + }; + static int instance_count_; std::unique_ptr data_dumper_; const Aec3Optimization optimization_; @@ -77,14 +113,14 @@ class SuppressionGain { std::array last_gain_; std::array last_nearend_; std::array last_echo_; - std::array enr_transparent_; - std::array enr_suppress_; - std::array emr_transparent_; LowNoiseRenderDetector low_render_detector_; bool initial_state_ = true; int initial_state_change_counter_ = 0; const bool enable_new_suppression_; aec3::MovingAverage moving_average_; + const GainParameters nearend_params_; + const GainParameters normal_params_; + DominantNearendDetector dominant_nearend_detector_; RTC_DISALLOW_COPY_AND_ASSIGN(SuppressionGain); }; diff --git a/modules/audio_processing/test/audio_processing_simulator.cc b/modules/audio_processing/test/audio_processing_simulator.cc index 2500a967c3..9fea4080cd 100644 --- a/modules/audio_processing/test/audio_processing_simulator.cc +++ b/modules/audio_processing/test/audio_processing_simulator.cc @@ -290,14 +290,6 @@ class Aec3ParametersParser { &cfg.echo_audibility.use_stationary_properties); } - if (rtc::GetValueFromJsonObject(root, "gain_updates", §ion)) { - ReadParam(section, "max_inc_factor", &cfg.gain_updates.max_inc_factor); - ReadParam(section, "max_dec_factor_lf", - &cfg.gain_updates.max_dec_factor_lf); - ReadParam(section, "floor_first_increase", - &cfg.gain_updates.floor_first_increase); - } - if (rtc::GetValueFromJsonObject(root, "echo_removal_control", §ion)) { Json::Value subsection; if (rtc::GetValueFromJsonObject(section, "gain_rampup", &subsection)) { @@ -338,11 +330,45 @@ class Aec3ParametersParser { &cfg.echo_model.nonlinear_release); } + Json::Value subsection; if (rtc::GetValueFromJsonObject(root, "suppressor", §ion)) { ReadParam(section, "nearend_average_blocks", &cfg.suppressor.nearend_average_blocks); - ReadParam(section, "mask_lf", &cfg.suppressor.mask_lf); - ReadParam(section, "mask_hf", &cfg.suppressor.mask_hf); + + if (rtc::GetValueFromJsonObject(section, "normal_tuning", &subsection)) { + ReadParam(subsection, "mask_lf", &cfg.suppressor.normal_tuning.mask_lf); + ReadParam(subsection, "mask_hf", &cfg.suppressor.normal_tuning.mask_hf); + ReadParam(subsection, "max_inc_factor", + &cfg.suppressor.normal_tuning.max_inc_factor); + ReadParam(subsection, "max_dec_factor_lf", + &cfg.suppressor.normal_tuning.max_dec_factor_lf); + } + + if (rtc::GetValueFromJsonObject(section, "nearend_tuning", &subsection)) { + ReadParam(subsection, "mask_lf", + &cfg.suppressor.nearend_tuning.mask_lf); + ReadParam(subsection, "mask_hf", + &cfg.suppressor.nearend_tuning.mask_hf); + ReadParam(subsection, "max_inc_factor", + &cfg.suppressor.nearend_tuning.max_inc_factor); + ReadParam(subsection, "max_dec_factor_lf", + &cfg.suppressor.nearend_tuning.max_dec_factor_lf); + } + + if (rtc::GetValueFromJsonObject(section, "dominant_nearend_detection", + &subsection)) { + ReadParam(subsection, "enr_threshold", + &cfg.suppressor.dominant_nearend_detection.enr_threshold); + ReadParam(subsection, "snr_threshold", + &cfg.suppressor.dominant_nearend_detection.snr_threshold); + ReadParam(subsection, "hold_duration", + &cfg.suppressor.dominant_nearend_detection.hold_duration); + ReadParam(subsection, "trigger_threshold", + &cfg.suppressor.dominant_nearend_detection.trigger_threshold); + } + + ReadParam(section, "floor_first_increase", + &cfg.suppressor.floor_first_increase); ReadParam(section, "enforce_transparent", &cfg.suppressor.enforce_transparent); ReadParam(section, "enforce_empty_higher_bands",