
Multi-channel behaviors introduced in this CL:
- All filters are analyzed independently. The filtering is considered consistent if any filter is consistent.
- The filter echo path gain used to detect saturation is maxed across capture channels.
- The filter delay is taken to be the minimum of all filters: any module that looks in the render data starting from the filter delay will iterate over all render audio present in any channel.
- The FilterAnalyzer will consider a render block to be active if any render channel has activity.

The changes in the CL have been shown to be bitexact on a large set of mono recordings.

Bug: webrtc:10913
Change-Id: I1e360cd7136ee82d1f6e0f8a1459806e83f4426d
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/155363
Reviewed-by: Per Åhgren <peah@webrtc.org>
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29408}
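Illustration only (not part of the CL): a minimal sketch of the per-channel aggregation rules listed above. ChannelState and AggregateChannelStates are hypothetical names used solely for this sketch; the actual logic lives in AecState::Update() and AecState::FilterDelay::Update() in the file below. Render-block activity follows the same pattern, ORed over the render channels.

#include <algorithm>
#include <vector>

// Hypothetical per-capture-channel analysis result.
struct ChannelState {
  bool consistent = false;     // From the channel's filter analysis.
  float echo_path_gain = 0.f;  // From the channel's filter analysis.
  int delay_blocks = 0;        // Estimated direct-path delay in blocks.
};

// Hypothetical aggregate used by the rest of the echo canceller.
struct AggregateState {
  bool any_consistent = false;     // Consistent if any filter is consistent.
  float max_echo_path_gain = 0.f;  // Max gain across capture channels.
  int min_delay_blocks = 0;        // Min delay across filters.
};

AggregateState AggregateChannelStates(const std::vector<ChannelState>& chs) {
  AggregateState agg;
  agg.min_delay_blocks = chs.empty() ? 0 : chs[0].delay_blocks;
  for (const ChannelState& ch : chs) {
    agg.any_consistent = agg.any_consistent || ch.consistent;
    agg.max_echo_path_gain = std::max(agg.max_echo_path_gain, ch.echo_path_gain);
    agg.min_delay_blocks = std::min(agg.min_delay_blocks, ch.delay_blocks);
  }
  return agg;
}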
/*
 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/aec3/aec_state.h"
|
|
|
|
#include <math.h>
|
|
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
#include <vector>
|
|
|
|
#include "absl/types/optional.h"
|
|
#include "api/array_view.h"
|
|
#include "modules/audio_processing/aec3/aec3_common.h"
|
|
#include "modules/audio_processing/logging/apm_data_dumper.h"
|
|
#include "rtc_base/atomic_ops.h"
|
|
#include "rtc_base/checks.h"
|
|
|
|
namespace webrtc {
namespace {

constexpr size_t kBlocksSinceConvergencedFilterInit = 10000;
constexpr size_t kBlocksSinceConsistentEstimateInit = 10000;

}  // namespace

int AecState::instance_count_ = 0;

void AecState::GetResidualEchoScaling(
    rtc::ArrayView<float> residual_scaling) const {
  bool filter_has_had_time_to_converge;
  if (config_.filter.conservative_initial_phase) {
    filter_has_had_time_to_converge =
        strong_not_saturated_render_blocks_ >= 1.5f * kNumBlocksPerSecond;
  } else {
    filter_has_had_time_to_converge =
        strong_not_saturated_render_blocks_ >= 0.8f * kNumBlocksPerSecond;
  }
  echo_audibility_.GetResidualEchoScaling(filter_has_had_time_to_converge,
                                          residual_scaling);
}

absl::optional<float> AecState::ErleUncertainty() const {
  if (SaturatedEcho()) {
    return 1.f;
  }

  return absl::nullopt;
}

AecState::AecState(const EchoCanceller3Config& config,
                   size_t num_capture_channels)
    : data_dumper_(
          new ApmDataDumper(rtc::AtomicOps::Increment(&instance_count_))),
      config_(config),
      initial_state_(config_),
      delay_state_(config_),
      transparent_state_(config_),
      filter_quality_state_(config_),
      erl_estimator_(2 * kNumBlocksPerSecond),
      erle_estimator_(2 * kNumBlocksPerSecond, config_, num_capture_channels),
      max_echo_path_gain_(config_.ep_strength.default_gain),
      filter_analyzers_(num_capture_channels),
      echo_audibility_(
          config_.echo_audibility.use_stationarity_properties_at_init),
      reverb_model_estimator_(config_),
      subtractor_output_analyzers_(num_capture_channels) {
  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    filter_analyzers_[ch] = std::make_unique<FilterAnalyzer>(config_);
  }
}

AecState::~AecState() = default;

void AecState::HandleEchoPathChange(
    const EchoPathVariability& echo_path_variability) {
  const auto full_reset = [&]() {
    for (auto& filter_analyzer : filter_analyzers_) {
      filter_analyzer->Reset();
    }
    max_echo_path_gain_ = config_.ep_strength.default_gain;
    capture_signal_saturation_ = false;
    strong_not_saturated_render_blocks_ = 0;
    blocks_with_active_render_ = 0;
    initial_state_.Reset();
    transparent_state_.Reset();
    erle_estimator_.Reset(true);
    erl_estimator_.Reset();
    filter_quality_state_.Reset();
  };

  // TODO(peah): Refine the reset scheme according to the type of gain and
  // delay adjustment.

  if (echo_path_variability.delay_change !=
      EchoPathVariability::DelayAdjustment::kNone) {
    full_reset();
  } else if (echo_path_variability.gain_change) {
    erle_estimator_.Reset(false);
  }
  for (auto& analyzer : subtractor_output_analyzers_) {
    analyzer.HandleEchoPathChange();
  }
}

void AecState::Update(
    const absl::optional<DelayEstimate>& external_delay,
    rtc::ArrayView<const std::vector<std::array<float, kFftLengthBy2Plus1>>>
        adaptive_filter_frequency_response,
    rtc::ArrayView<const std::vector<float>> adaptive_filter_impulse_response,
    const RenderBuffer& render_buffer,
    const std::array<float, kFftLengthBy2Plus1>& E2_main,
    const std::array<float, kFftLengthBy2Plus1>& Y2,
    rtc::ArrayView<const SubtractorOutput> subtractor_output) {
  const size_t num_capture_channels = filter_analyzers_.size();
  RTC_DCHECK_EQ(num_capture_channels, subtractor_output.size());
  RTC_DCHECK_EQ(num_capture_channels, subtractor_output_analyzers_.size());
  RTC_DCHECK_EQ(num_capture_channels,
                adaptive_filter_frequency_response.size());
  RTC_DCHECK_EQ(num_capture_channels, adaptive_filter_impulse_response.size());

  // Analyze the filter outputs and filters.
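  // Per-channel results are folded into aggregate decisions: convergence and
  // consistency are ORed over the capture channels, divergence is ANDed, and
  // the echo path gain is the maximum over the channels.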
  bool any_filter_converged = false;
  bool all_filters_diverged = true;
  bool any_filter_consistent = false;
  max_echo_path_gain_ = 0.f;
  for (size_t ch = 0; ch < subtractor_output.size(); ++ch) {
    subtractor_output_analyzers_[ch].Update(subtractor_output[ch]);
    any_filter_converged = any_filter_converged ||
                           subtractor_output_analyzers_[ch].ConvergedFilter();
    all_filters_diverged = all_filters_diverged &&
                           subtractor_output_analyzers_[ch].DivergedFilter();

    filter_analyzers_[ch]->Update(adaptive_filter_impulse_response[ch],
                                  render_buffer);
    any_filter_consistent =
        any_filter_consistent || filter_analyzers_[ch]->Consistent();
    max_echo_path_gain_ =
        std::max(max_echo_path_gain_, filter_analyzers_[ch]->Gain());
  }

  // Estimate the direct path delay of the filter.
  if (config_.filter.use_linear_filter) {
    delay_state_.Update(filter_analyzers_, external_delay,
                        strong_not_saturated_render_blocks_);
  }

  const std::vector<std::vector<float>>& aligned_render_block =
      render_buffer.Block(-delay_state_.DirectPathFilterDelay())[0];

  // Update render counters.
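  // A block is flagged as active render if any render channel carries
  // sufficient energy.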
  bool active_render = false;
  for (size_t ch = 0; ch < aligned_render_block.size(); ++ch) {
    const float render_energy = std::inner_product(
        aligned_render_block[ch].begin(), aligned_render_block[ch].end(),
        aligned_render_block[ch].begin(), 0.f);
    if (render_energy > (config_.render_levels.active_render_limit *
                         config_.render_levels.active_render_limit) *
                            kFftLengthBy2) {
      active_render = true;
      break;
    }
  }
  blocks_with_active_render_ += active_render ? 1 : 0;
  strong_not_saturated_render_blocks_ +=
      active_render && !SaturatedCapture() ? 1 : 0;

  std::array<float, kFftLengthBy2Plus1> X2_reverb;
  render_reverb_.Apply(render_buffer.GetSpectrumBuffer(),
                       delay_state_.DirectPathFilterDelay(), ReverbDecay(),
                       X2_reverb);

  if (config_.echo_audibility.use_stationarity_properties) {
    // Update the echo audibility evaluator.
    echo_audibility_.Update(render_buffer,
                            render_reverb_.GetReverbContributionPowerSpectrum(),
                            delay_state_.DirectPathFilterDelay(),
                            delay_state_.ExternalDelayReported());
  }

  // Update the ERL and ERLE measures.
  if (initial_state_.TransitionTriggered()) {
    erle_estimator_.Reset(false);
  }

  // TODO(bugs.webrtc.org/10913): Take all channels into account.
  const auto& X2 = render_buffer.Spectrum(delay_state_.DirectPathFilterDelay(),
                                          /*channel=*/0);
  const auto& X2_input_erle = X2_reverb;

  erle_estimator_.Update(render_buffer, adaptive_filter_frequency_response[0],
                         X2_input_erle, Y2, E2_main,
                         subtractor_output_analyzers_[0].ConvergedFilter(),
                         config_.erle.onset_detection);

  erl_estimator_.Update(subtractor_output_analyzers_[0].ConvergedFilter(), X2,
                        Y2);

  // Detect and flag echo saturation.
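  // The echo path gain passed here is the maximum over the capture channels.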
  saturation_detector_.Update(aligned_render_block, SaturatedCapture(),
                              UsableLinearEstimate(), subtractor_output,
                              EchoPathGain());

  // Update the decision on whether to use the initial state parameter set.
  initial_state_.Update(active_render, SaturatedCapture());

  // Detect whether the transparent mode should be activated.
  transparent_state_.Update(delay_state_.DirectPathFilterDelay(),
                            any_filter_consistent, any_filter_converged,
                            all_filters_diverged, active_render,
                            SaturatedCapture());

  // Analyze the quality of the filter.
  filter_quality_state_.Update(active_render, TransparentMode(),
                               SaturatedCapture(), external_delay,
                               any_filter_converged);

  // Update the reverb estimate.
  const bool stationary_block =
      config_.echo_audibility.use_stationarity_properties &&
      echo_audibility_.IsBlockStationary();

  reverb_model_estimator_.Update(filter_analyzers_[0]->GetAdjustedFilter(),
                                 adaptive_filter_frequency_response[0],
                                 erle_estimator_.GetInstLinearQualityEstimate(),
                                 delay_state_.DirectPathFilterDelay(),
                                 UsableLinearEstimate(), stationary_block);

  erle_estimator_.Dump(data_dumper_);
  reverb_model_estimator_.Dump(data_dumper_.get());
  data_dumper_->DumpRaw("aec3_erl", Erl());
  data_dumper_->DumpRaw("aec3_erl_time_domain", ErlTimeDomain());
  data_dumper_->DumpRaw("aec3_erle", Erle()[0]);
  data_dumper_->DumpRaw("aec3_usable_linear_estimate", UsableLinearEstimate());
  data_dumper_->DumpRaw("aec3_transparent_mode", TransparentMode());
  data_dumper_->DumpRaw("aec3_filter_delay",
                        filter_analyzers_[0]->DelayBlocks());

  data_dumper_->DumpRaw("aec3_any_filter_consistent", any_filter_consistent);
  data_dumper_->DumpRaw("aec3_initial_state",
                        initial_state_.InitialStateActive());
  data_dumper_->DumpRaw("aec3_capture_saturation", SaturatedCapture());
  data_dumper_->DumpRaw("aec3_echo_saturation", SaturatedEcho());
  data_dumper_->DumpRaw("aec3_any_filter_converged", any_filter_converged);
  data_dumper_->DumpRaw("aec3_all_filters_diverged", all_filters_diverged);

  data_dumper_->DumpRaw("aec3_external_delay_avaliable",
                        external_delay ? 1 : 0);
  data_dumper_->DumpRaw("aec3_filter_tail_freq_resp_est",
                        GetReverbFrequencyResponse());
}

AecState::InitialState::InitialState(const EchoCanceller3Config& config)
    : conservative_initial_phase_(config.filter.conservative_initial_phase),
      initial_state_seconds_(config.filter.initial_state_seconds) {
  Reset();
}
void AecState::InitialState::InitialState::Reset() {
  initial_state_ = true;
  strong_not_saturated_render_blocks_ = 0;
}
void AecState::InitialState::InitialState::Update(bool active_render,
                                                  bool saturated_capture) {
  strong_not_saturated_render_blocks_ +=
      active_render && !saturated_capture ? 1 : 0;

  // Flag whether the initial state is still active.
  bool prev_initial_state = initial_state_;
  if (conservative_initial_phase_) {
    initial_state_ =
        strong_not_saturated_render_blocks_ < 5 * kNumBlocksPerSecond;
  } else {
    initial_state_ = strong_not_saturated_render_blocks_ <
                     initial_state_seconds_ * kNumBlocksPerSecond;
  }

  // Flag whether the transition from the initial state has started.
  transition_triggered_ = !initial_state_ && prev_initial_state;
}

AecState::FilterDelay::FilterDelay(const EchoCanceller3Config& config)
    : delay_headroom_samples_(config.delay.delay_headroom_samples) {}

void AecState::FilterDelay::Update(
    const std::vector<std::unique_ptr<FilterAnalyzer>>& filter_analyzers,
    const absl::optional<DelayEstimate>& external_delay,
    size_t blocks_with_proper_filter_adaptation) {
  // Update the delay based on the external delay.
  if (external_delay &&
      (!external_delay_ || external_delay_->delay != external_delay->delay)) {
    external_delay_ = external_delay;
    external_delay_reported_ = true;
  }

  // Override the estimated delay if it is not certain that the filter has had
  // time to converge.
  const bool delay_estimator_may_not_have_converged =
      blocks_with_proper_filter_adaptation < 2 * kNumBlocksPerSecond;
  if (delay_estimator_may_not_have_converged && external_delay_) {
    filter_delay_blocks_ = delay_headroom_samples_ / kBlockSize;
  } else {
    // Conservatively use the min delay among the filters.
    filter_delay_blocks_ = filter_analyzers[0]->DelayBlocks();
    for (size_t ch = 1; ch < filter_analyzers.size(); ++ch) {
      filter_delay_blocks_ =
          std::min(filter_delay_blocks_, filter_analyzers[ch]->DelayBlocks());
    }
  }
}

AecState::TransparentMode::TransparentMode(const EchoCanceller3Config& config)
    : bounded_erl_(config.ep_strength.bounded_erl),
      linear_and_stable_echo_path_(
          config.echo_removal_control.linear_and_stable_echo_path),
      active_blocks_since_sane_filter_(kBlocksSinceConsistentEstimateInit),
      non_converged_sequence_size_(kBlocksSinceConvergencedFilterInit) {}

void AecState::TransparentMode::Reset() {
  non_converged_sequence_size_ = kBlocksSinceConvergencedFilterInit;
  diverged_sequence_size_ = 0;
  strong_not_saturated_render_blocks_ = 0;
  if (linear_and_stable_echo_path_) {
    recent_convergence_during_activity_ = false;
  }
}

void AecState::TransparentMode::Update(int filter_delay_blocks,
                                       bool any_filter_consistent,
                                       bool any_filter_converged,
                                       bool all_filters_diverged,
                                       bool active_render,
                                       bool saturated_capture) {
  ++capture_block_counter_;
  strong_not_saturated_render_blocks_ +=
      active_render && !saturated_capture ? 1 : 0;

  if (any_filter_consistent && filter_delay_blocks < 5) {
    sane_filter_observed_ = true;
    active_blocks_since_sane_filter_ = 0;
  } else if (active_render) {
    ++active_blocks_since_sane_filter_;
  }

  bool sane_filter_recently_seen;
  if (!sane_filter_observed_) {
    sane_filter_recently_seen =
        capture_block_counter_ <= 5 * kNumBlocksPerSecond;
  } else {
    sane_filter_recently_seen =
        active_blocks_since_sane_filter_ <= 30 * kNumBlocksPerSecond;
  }

  if (any_filter_converged) {
    recent_convergence_during_activity_ = true;
    active_non_converged_sequence_size_ = 0;
    non_converged_sequence_size_ = 0;
    ++num_converged_blocks_;
  } else {
    if (++non_converged_sequence_size_ > 20 * kNumBlocksPerSecond) {
      num_converged_blocks_ = 0;
    }

    if (active_render &&
        ++active_non_converged_sequence_size_ > 60 * kNumBlocksPerSecond) {
      recent_convergence_during_activity_ = false;
    }
  }

  if (!all_filters_diverged) {
    diverged_sequence_size_ = 0;
  } else if (++diverged_sequence_size_ >= 60) {
    // TODO(peah): Change these lines to ensure proper triggering of usable
    // filter.
    non_converged_sequence_size_ = kBlocksSinceConvergencedFilterInit;
  }

  if (active_non_converged_sequence_size_ > 60 * kNumBlocksPerSecond) {
    finite_erl_recently_detected_ = false;
  }
  if (num_converged_blocks_ > 50) {
    finite_erl_recently_detected_ = true;
  }

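  // Activate transparency only when the ERL is not assumed bounded, no finite
  // ERL has recently been detected, a sane filter with recent convergence has
  // not been seen, and there has been enough strong render for the filter to
  // have converged.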
  if (bounded_erl_) {
    transparency_activated_ = false;
  } else if (finite_erl_recently_detected_) {
    transparency_activated_ = false;
  } else if (sane_filter_recently_seen && recent_convergence_during_activity_) {
    transparency_activated_ = false;
  } else {
    const bool filter_should_have_converged =
        strong_not_saturated_render_blocks_ > 6 * kNumBlocksPerSecond;
    transparency_activated_ = filter_should_have_converged;
  }
}

AecState::FilteringQualityAnalyzer::FilteringQualityAnalyzer(
    const EchoCanceller3Config& config) {}

void AecState::FilteringQualityAnalyzer::Reset() {
  usable_linear_estimate_ = false;
  filter_update_blocks_since_reset_ = 0;
}

void AecState::FilteringQualityAnalyzer::Update(
    bool active_render,
    bool transparent_mode,
    bool saturated_capture,
    const absl::optional<DelayEstimate>& external_delay,
    bool any_filter_converged) {
  // Update blocks counter.
  const bool filter_update = active_render && !saturated_capture;
  filter_update_blocks_since_reset_ += filter_update ? 1 : 0;
  filter_update_blocks_since_start_ += filter_update ? 1 : 0;

  // Store convergence flag when observed.
  convergence_seen_ = convergence_seen_ || any_filter_converged;

  // Verify requirements for achieving a decent filter. The requirements for
  // filter adaptation at call startup are more restrictive than after an
  // in-call reset.
  const bool sufficient_data_to_converge_at_startup =
      filter_update_blocks_since_start_ > kNumBlocksPerSecond * 0.4f;
  const bool sufficient_data_to_converge_at_reset =
      sufficient_data_to_converge_at_startup &&
      filter_update_blocks_since_reset_ > kNumBlocksPerSecond * 0.2f;

  // The linear filter can only be used if it has had time to converge.
  usable_linear_estimate_ = sufficient_data_to_converge_at_startup &&
                            sufficient_data_to_converge_at_reset;

  // The linear filter can only be used if an external delay or convergence
  // has been identified.
  usable_linear_estimate_ =
      usable_linear_estimate_ && (external_delay || convergence_seen_);

  // If transparent mode is on, deactivate using the linear filter.
  usable_linear_estimate_ = usable_linear_estimate_ && !transparent_mode;
}

void AecState::SaturationDetector::Update(
    rtc::ArrayView<const std::vector<float>> x,
    bool saturated_capture,
    bool usable_linear_estimate,
    rtc::ArrayView<const SubtractorOutput> subtractor_output,
    float echo_path_gain) {
  saturated_echo_ = false;
  if (!saturated_capture) {
    return;
  }

  if (usable_linear_estimate) {
    constexpr float kSaturationThreshold = 20000.f;
    for (size_t ch = 0; ch < subtractor_output.size(); ++ch) {
      saturated_echo_ =
          saturated_echo_ ||
          (subtractor_output[ch].s_main_max_abs > kSaturationThreshold ||
           subtractor_output[ch].s_shadow_max_abs > kSaturationThreshold);
    }
  } else {
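    // Without a usable linear estimate, predict the peak echo amplitude from
    // the render peak and the (per-channel maximum) echo path gain, with a
    // safety margin.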
    float max_sample = 0.f;
    for (auto& channel : x) {
      for (float sample : channel) {
        max_sample = std::max(max_sample, fabsf(sample));
      }
    }

    const float kMargin = 10.f;
    float peak_echo_amplitude = max_sample * echo_path_gain * kMargin;
    saturated_echo_ = saturated_echo_ || peak_echo_amplitude > 32000;
  }
}

}  // namespace webrtc