AGC2 lightweight noise floor estimator

The current noise level estimator has a bug due to which the estimated
level decays to the lower bound in a few seconds when speech is observed.
Instead of fixing the current implementation, which is based on a
stationarity classifier, an alternative, lightweight, noise floor
estimator has been added and tuned for AGC2.

Tested on several AEC dumps including HW mute, music and fast talking.

Bug: webrtc:7494
Change-Id: Iae4cff9fc955a716878f830957e893cd5bc59446
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/214133
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33733}
This commit is contained in:
Alessio Bazzica
2021-04-14 16:17:09 +02:00
committed by Commit Bot
parent 3ab7a55f6e
commit 61982a7f2d
10 changed files with 246 additions and 42 deletions

View File

@ -20,6 +20,11 @@
namespace webrtc {
namespace {
using AdaptiveDigitalConfig =
AudioProcessing::Config::GainController2::AdaptiveDigital;
using NoiseEstimatorType =
AudioProcessing::Config::GainController2::NoiseEstimator;
void DumpDebugData(const AdaptiveDigitalGainApplier::FrameInfo& info,
ApmDataDumper& dumper) {
dumper.DumpRaw("agc2_vad_probability", info.vad_result.speech_probability);
@ -35,7 +40,7 @@ constexpr float kMaxOutputNoiseLevelDbfs = -50.0f;
// Detects the available CPU features and applies any kill-switches.
AvailableCpuFeatures GetAllowedCpuFeatures(
const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
const AdaptiveDigitalConfig& config) {
AvailableCpuFeatures features = GetAvailableCpuFeatures();
if (!config.sse2_allowed) {
features.sse2 = false;
@ -49,6 +54,20 @@ AvailableCpuFeatures GetAllowedCpuFeatures(
return features;
}
// Creates the noise level estimator selected by `estimator_type`.
// `apm_data_dumper` is forwarded unchanged to the selected factory.
std::unique_ptr<NoiseLevelEstimator> CreateNoiseLevelEstimator(
    NoiseEstimatorType estimator_type,
    ApmDataDumper* apm_data_dumper) {
  // No `default:` label on purpose, so that the compiler can flag enumerators
  // that are added later and not handled here.
  switch (estimator_type) {
    case NoiseEstimatorType::kStationaryNoise:
      return CreateStationaryNoiseEstimator(apm_data_dumper);
    case NoiseEstimatorType::kNoiseFloor:
      return CreateNoiseFloorEstimator(apm_data_dumper);
  }
  // Only reachable if `estimator_type` holds a value outside the enum. Crash
  // explicitly instead of flowing off the end of a non-void function, which
  // is undefined behavior.
  RTC_CHECK_NOTREACHED();
}
constexpr NoiseEstimatorType kDefaultNoiseLevelEstimatorType =
NoiseEstimatorType::kNoiseFloor;
} // namespace
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper)
@ -58,31 +77,32 @@ AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper)
kMaxGainChangePerSecondDb,
kMaxOutputNoiseLevelDbfs),
apm_data_dumper_(apm_data_dumper),
noise_level_estimator_(CreateNoiseLevelEstimator(apm_data_dumper)) {
noise_level_estimator_(
CreateNoiseLevelEstimator(kDefaultNoiseLevelEstimatorType,
apm_data_dumper)) {
RTC_DCHECK(apm_data_dumper);
}
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper,
const AudioProcessing::Config::GainController2& config)
const AdaptiveDigitalConfig& config)
: speech_level_estimator_(
apm_data_dumper,
config.adaptive_digital.level_estimator,
config.adaptive_digital
.level_estimator_adjacent_speech_frames_threshold,
config.adaptive_digital.initial_saturation_margin_db,
config.adaptive_digital.extra_saturation_margin_db),
vad_(config.adaptive_digital.vad_reset_period_ms,
config.adaptive_digital.vad_probability_attack,
GetAllowedCpuFeatures(config.adaptive_digital)),
gain_applier_(
apm_data_dumper,
config.adaptive_digital.gain_applier_adjacent_speech_frames_threshold,
config.adaptive_digital.max_gain_change_db_per_second,
config.adaptive_digital.max_output_noise_level_dbfs),
config.level_estimator,
config.level_estimator_adjacent_speech_frames_threshold,
config.initial_saturation_margin_db,
config.extra_saturation_margin_db),
vad_(config.vad_reset_period_ms,
config.vad_probability_attack,
GetAllowedCpuFeatures(config)),
gain_applier_(apm_data_dumper,
config.gain_applier_adjacent_speech_frames_threshold,
config.max_gain_change_db_per_second,
config.max_output_noise_level_dbfs),
apm_data_dumper_(apm_data_dumper),
noise_level_estimator_(CreateNoiseLevelEstimator(apm_data_dumper)) {
noise_level_estimator_(
CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)) {
RTC_DCHECK(apm_data_dumper);
if (!config.adaptive_digital.use_saturation_protector) {
if (!config.use_saturation_protector) {
RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled.";
}
}

View File

@ -29,8 +29,9 @@ class AdaptiveAgc {
public:
explicit AdaptiveAgc(ApmDataDumper* apm_data_dumper);
// TODO(crbug.com/webrtc/7494): Remove ctor above.
AdaptiveAgc(ApmDataDumper* apm_data_dumper,
const AudioProcessing::Config::GainController2& config);
AdaptiveAgc(
ApmDataDumper* apm_data_dumper,
const AudioProcessing::Config::GainController2::AdaptiveDigital& config);
~AdaptiveAgc();
// Analyzes `frame` and applies a digital adaptive gain to it. Takes into

View File

@ -35,7 +35,7 @@ constexpr float kLimiterThresholdForAgcGainDbfs = -kHeadroomDbfs;
// This is the threshold for speech. Speech frames are used for updating the
// speech level, measuring the amount of speech, and deciding when to allow
// target gain reduction.
constexpr float kVadConfidenceThreshold = 0.9f;
constexpr float kVadConfidenceThreshold = 0.95f;
// The amount of 'memory' of the Level Estimator. Decides leak factors.
constexpr int kFullBufferSizeMs = 1200;

View File

@ -46,13 +46,15 @@ class NoiseLevelEstimatorImpl : public NoiseLevelEstimator {
public:
NoiseLevelEstimatorImpl(ApmDataDumper* data_dumper)
: data_dumper_(data_dumper), signal_classifier_(data_dumper) {
Initialize(48000);
// Initially assume that 48 kHz will be used. `Analyze()` will detect the
// used sample rate and call `Initialize()` again if needed.
Initialize(/*sample_rate_hz=*/48000);
}
NoiseLevelEstimatorImpl(const NoiseLevelEstimatorImpl&) = delete;
NoiseLevelEstimatorImpl& operator=(const NoiseLevelEstimatorImpl&) = delete;
~NoiseLevelEstimatorImpl() = default;
float Analyze(const AudioFrameView<const float>& frame) {
float Analyze(const AudioFrameView<const float>& frame) override {
data_dumper_->DumpRaw("agc2_noise_level_estimator_hold_counter",
noise_energy_hold_counter_);
const int sample_rate_hz =
@ -122,6 +124,7 @@ class NoiseLevelEstimatorImpl : public NoiseLevelEstimator {
sample_rate_hz_ = sample_rate_hz;
noise_energy_ = 1.0f;
first_update_ = true;
// Initialize the minimum noise energy to -84 dBFS.
min_noise_energy_ = sample_rate_hz * 2.0f * 2.0f / kFramesPerSecond;
noise_energy_hold_counter_ = 0;
signal_classifier_.Initialize(sample_rate_hz);
@ -136,11 +139,122 @@ class NoiseLevelEstimatorImpl : public NoiseLevelEstimator {
SignalClassifier signal_classifier_;
};
// Smooths the noise floor estimate with a slow attack and an instant decay.
// The tuning is specific to AGC2: (i) the instant decay lets the gain increase
// promptly when the noise floor drops and (ii) the slow attack slows down the
// gain reduction when music or fast speech cause the noise floor to be
// overestimated.
// Returns `new_estimate` when it is not greater than `current_estimate`;
// otherwise returns a weighted average of the two estimates.
float SmoothNoiseFloorEstimate(float current_estimate, float new_estimate) {
  constexpr float kAttack = 0.5f;
  const bool noise_floor_rising = current_estimate < new_estimate;
  if (!noise_floor_rising) {
    // Instant decay: adopt the new estimate right away.
    return new_estimate;
  }
  // Slow attack: move only part of the way towards the higher estimate.
  return kAttack * new_estimate + (1.0f - kAttack) * current_estimate;
}
// Noise level estimator that tracks the noise floor, namely the minimum frame
// energy observed over periods of `kUpdatePeriodNumFrames` frames. Frames whose
// energy does not exceed the minimum measurable energy are ignored.
class NoiseFloorEstimator : public NoiseLevelEstimator {
 public:
  // Update the noise floor every 5 seconds.
  static constexpr int kUpdatePeriodNumFrames = 500;
  static_assert(kUpdatePeriodNumFrames >= 200,
                "A too small value may cause noise level overestimation.");
  static_assert(kUpdatePeriodNumFrames <= 1500,
                "A too large value may make AGC2 slow at reacting to increased "
                "noise levels.");

  // `data_dumper` must not be null since `Analyze()` dereferences it.
  explicit NoiseFloorEstimator(ApmDataDumper* data_dumper)
      : data_dumper_(data_dumper) {
    // Initially assume that 48 kHz will be used. `Analyze()` will detect the
    // used sample rate and call `Initialize()` again if needed.
    Initialize(/*sample_rate_hz=*/48000);
  }
  NoiseFloorEstimator(const NoiseFloorEstimator&) = delete;
  NoiseFloorEstimator& operator=(const NoiseFloorEstimator&) = delete;
  ~NoiseFloorEstimator() = default;

  // Analyzes `frame`, updates the noise floor estimate and returns it after
  // converting the energy via `EnergyToDbfs()`. The internal state is reset
  // when the sample rate inferred from the frame size changes.
  float Analyze(const AudioFrameView<const float>& frame) override {
    // Detect sample rate changes.
    const int sample_rate_hz =
        static_cast<int>(frame.samples_per_channel() * kFramesPerSecond);
    if (sample_rate_hz != sample_rate_hz_) {
      Initialize(sample_rate_hz);
    }

    const float frame_energy = FrameEnergy(frame);
    if (frame_energy <= min_noise_energy_) {
      // Ignore frames when muted or below the minimum measurable energy.
      // Neither the preliminary estimate nor the period counter is updated.
      data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
                            noise_energy_);
      return EnergyToDbfs(noise_energy_, frame.samples_per_channel());
    }

    // Track the minimum frame energy observed in the current period.
    if (preliminary_noise_energy_set_) {
      preliminary_noise_energy_ =
          std::min(preliminary_noise_energy_, frame_energy);
    } else {
      preliminary_noise_energy_ = frame_energy;
      preliminary_noise_energy_set_ = true;
    }
    data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
                          preliminary_noise_energy_);

    if (counter_ == 0) {
      // Full period observed.
      first_period_ = false;
      // Update the estimated noise floor energy with the preliminary
      // estimation.
      noise_energy_ = SmoothNoiseFloorEstimate(
          /*current_estimate=*/noise_energy_,
          /*new_estimate=*/preliminary_noise_energy_);
      // Reset for a new observation period.
      counter_ = kUpdatePeriodNumFrames;
      preliminary_noise_energy_set_ = false;
    } else if (first_period_) {
      // While analyzing the signal during the initial period, continuously
      // update the estimated noise energy, which is monotonic.
      noise_energy_ = preliminary_noise_energy_;
      counter_--;
    } else {
      // During the observation period it's only allowed to lower the energy.
      noise_energy_ = std::min(noise_energy_, preliminary_noise_energy_);
      counter_--;
    }
    return EnergyToDbfs(noise_energy_, frame.samples_per_channel());
  }

 private:
  // Resets the whole estimator state for the given sample rate.
  void Initialize(int sample_rate_hz) {
    sample_rate_hz_ = sample_rate_hz;
    first_period_ = true;
    preliminary_noise_energy_set_ = false;
    // Initialize the minimum noise energy to -84 dBFS.
    min_noise_energy_ = sample_rate_hz * 2.0f * 2.0f / kFramesPerSecond;
    preliminary_noise_energy_ = min_noise_energy_;
    noise_energy_ = min_noise_energy_;
    counter_ = kUpdatePeriodNumFrames;
  }

  ApmDataDumper* const data_dumper_;
  // Sample rate for which the state below has been initialized.
  int sample_rate_hz_;
  // Frames with energy not above this value are ignored.
  float min_noise_energy_;
  // True until the first full observation period has elapsed.
  bool first_period_;
  // False until a frame is accounted for in the current observation period.
  bool preliminary_noise_energy_set_;
  // Minimum frame energy observed in the current observation period.
  float preliminary_noise_energy_;
  // Current noise floor estimate (energy).
  float noise_energy_;
  // Number of frames left before the current observation period ends.
  int counter_;
};
} // namespace
std::unique_ptr<NoiseLevelEstimator> CreateNoiseLevelEstimator(
std::unique_ptr<NoiseLevelEstimator> CreateStationaryNoiseEstimator(
ApmDataDumper* data_dumper) {
return std::make_unique<NoiseLevelEstimatorImpl>(data_dumper);
}
// Builds a `NoiseFloorEstimator` and hands it out through the
// `NoiseLevelEstimator` interface. `data_dumper` is passed to the estimator.
std::unique_ptr<NoiseLevelEstimator> CreateNoiseFloorEstimator(
    ApmDataDumper* data_dumper) {
  std::unique_ptr<NoiseLevelEstimator> estimator =
      std::make_unique<NoiseFloorEstimator>(data_dumper);
  return estimator;
}
} // namespace webrtc

View File

@ -28,7 +28,11 @@ class NoiseLevelEstimator {
};
// Creates a noise level estimator based on stationarity detection.
std::unique_ptr<NoiseLevelEstimator> CreateNoiseLevelEstimator(
std::unique_ptr<NoiseLevelEstimator> CreateStationaryNoiseEstimator(
ApmDataDumper* data_dumper);
// Creates a noise level estimator based on noise floor detection.
std::unique_ptr<NoiseLevelEstimator> CreateNoiseFloorEstimator(
ApmDataDumper* data_dumper);
} // namespace webrtc

View File

@ -11,6 +11,7 @@
#include "modules/audio_processing/agc2/noise_level_estimator.h"
#include <array>
#include <cmath>
#include <functional>
#include <limits>
@ -29,21 +30,19 @@ constexpr int kFramesPerSecond = 100;
// Runs the noise estimator on audio generated by 'sample_generator'
// for kNumIterations. Returns the last noise level estimate.
float RunEstimator(rtc::FunctionView<float()> sample_generator,
NoiseLevelEstimator& estimator,
int sample_rate_hz) {
ApmDataDumper data_dumper(0);
auto estimator = CreateNoiseLevelEstimator(&data_dumper);
const int samples_per_channel =
rtc::CheckedDivExact(sample_rate_hz, kFramesPerSecond);
VectorFloatFrame signal(1, samples_per_channel, 0.0f);
for (int i = 0; i < kNumIterations; ++i) {
AudioFrameView<float> frame_view = signal.float_frame_view();
for (int j = 0; j < samples_per_channel; ++j) {
frame_view.channel(0)[j] = sample_generator();
}
estimator->Analyze(frame_view);
estimator.Analyze(frame_view);
}
return estimator->Analyze(signal.float_frame_view());
return estimator.Analyze(signal.float_frame_view());
}
class NoiseEstimatorParametrization : public ::testing::TestWithParam<int> {
@ -53,32 +52,82 @@ class NoiseEstimatorParametrization : public ::testing::TestWithParam<int> {
// White random noise is stationary, but does not trigger the detector
// every frame due to the randomness.
TEST_P(NoiseEstimatorParametrization, RandomNoise) {
TEST_P(NoiseEstimatorParametrization, StationaryNoiseEstimatorWithRandomNoise) {
ApmDataDumper data_dumper(0);
auto estimator = CreateStationaryNoiseEstimator(&data_dumper);
test::WhiteNoiseGenerator gen(/*min_amplitude=*/test::kMinS16,
/*max_amplitude=*/test::kMaxS16);
const float noise_level_dbfs = RunEstimator(gen, sample_rate_hz());
const float noise_level_dbfs =
RunEstimator(gen, *estimator, sample_rate_hz());
EXPECT_NEAR(noise_level_dbfs, -5.5f, 1.0f);
}
// Sine curves are (very) stationary. They trigger the detector all
// the time, except for a few initial frames.
TEST_P(NoiseEstimatorParametrization, SineTone) {
TEST_P(NoiseEstimatorParametrization, StationaryNoiseEstimatorWithSineTone) {
ApmDataDumper data_dumper(0);
auto estimator = CreateStationaryNoiseEstimator(&data_dumper);
test::SineGenerator gen(/*amplitude=*/test::kMaxS16, /*frequency_hz=*/600.0f,
sample_rate_hz());
const float noise_level_dbfs = RunEstimator(gen, sample_rate_hz());
const float noise_level_dbfs =
RunEstimator(gen, *estimator, sample_rate_hz());
EXPECT_NEAR(noise_level_dbfs, -3.0f, 1.0f);
}
// Pulses are transient if they are far enough apart. They shouldn't
// trigger the noise detector.
TEST_P(NoiseEstimatorParametrization, PulseTone) {
TEST_P(NoiseEstimatorParametrization, StationaryNoiseEstimatorWithPulseTone) {
ApmDataDumper data_dumper(0);
auto estimator = CreateStationaryNoiseEstimator(&data_dumper);
test::PulseGenerator gen(/*pulse_amplitude=*/test::kMaxS16,
/*no_pulse_amplitude=*/10.0f, /*frequency_hz=*/20.0f,
sample_rate_hz());
const int noise_level_dbfs = RunEstimator(gen, sample_rate_hz());
const int noise_level_dbfs = RunEstimator(gen, *estimator, sample_rate_hz());
EXPECT_NEAR(noise_level_dbfs, -79.0f, 1.0f);
}
// Verifies that the noise floor estimator reports about -5.5 dBFS when fed
// with full scale white noise.
TEST_P(NoiseEstimatorParametrization, NoiseFloorEstimatorWithRandomNoise) {
  constexpr float kExpectedNoiseLevelDbfs = -5.5f;
  constexpr float kToleranceDb = 0.5f;
  ApmDataDumper data_dumper(0);
  auto noise_floor_estimator = CreateNoiseFloorEstimator(&data_dumper);
  test::WhiteNoiseGenerator white_noise(/*min_amplitude=*/test::kMinS16,
                                        /*max_amplitude=*/test::kMaxS16);
  const float estimated_level_dbfs =
      RunEstimator(white_noise, *noise_floor_estimator, sample_rate_hz());
  EXPECT_NEAR(estimated_level_dbfs, kExpectedNoiseLevelDbfs, kToleranceDb);
}
// Verifies that the noise floor estimator reports about -3 dBFS when fed with
// a full scale 600 Hz sine wave.
TEST_P(NoiseEstimatorParametrization, NoiseFloorEstimatorWithSineTone) {
  constexpr float kExpectedNoiseLevelDbfs = -3.0f;
  constexpr float kToleranceDb = 0.1f;
  ApmDataDumper data_dumper(0);
  auto noise_floor_estimator = CreateNoiseFloorEstimator(&data_dumper);
  test::SineGenerator sine_wave(/*amplitude=*/test::kMaxS16,
                                /*frequency_hz=*/600.0f, sample_rate_hz());
  const float estimated_level_dbfs =
      RunEstimator(sine_wave, *noise_floor_estimator, sample_rate_hz());
  EXPECT_NEAR(estimated_level_dbfs, kExpectedNoiseLevelDbfs, kToleranceDb);
}
// Checks that sufficiently spaced periodic pulses do not raise the estimated
// noise floor, which is determined by the amplitude of the non-pulse samples.
TEST_P(NoiseEstimatorParametrization, NoiseFloorEstimatorWithPulseTone) {
  ApmDataDumper data_dumper(0);
  auto estimator = CreateNoiseFloorEstimator(&data_dumper);
  constexpr float kNoPulseAmplitude = 10.0f;
  test::PulseGenerator gen(/*pulse_amplitude=*/test::kMaxS16, kNoPulseAmplitude,
                           /*frequency_hz=*/20.0f, sample_rate_hz());
  // Keep the estimate as a float: storing it in an int would truncate the
  // fractional part before it is compared against the expected level.
  const float noise_level_dbfs =
      RunEstimator(gen, *estimator, sample_rate_hz());
  // Level of a constant signal with amplitude `kNoPulseAmplitude` relative to
  // full scale. `std::log10` selects the float overload for a float argument.
  const float expected_noise_floor_dbfs =
      20.0f * std::log10(kNoPulseAmplitude / test::kMaxS16);
  EXPECT_NEAR(noise_level_dbfs, expected_noise_floor_dbfs, 0.5f);
}
INSTANTIATE_TEST_SUITE_P(GainController2NoiseEstimator,
NoiseEstimatorParametrization,
::testing::Values(8000, 16000, 32000, 48000));

View File

@ -90,7 +90,8 @@ void GainController2::ApplyConfig(
}
gain_applier_.SetGainFactor(DbToRatio(config_.fixed_digital.gain_db));
if (config_.adaptive_digital.enabled) {
adaptive_agc_ = std::make_unique<AdaptiveAgc>(&data_dumper_, config_);
adaptive_agc_ =
std::make_unique<AdaptiveAgc>(&data_dumper_, config_.adaptive_digital);
} else {
adaptive_agc_.reset();
}

View File

@ -351,7 +351,7 @@ TEST(GainController2, UsageNoSaturationMargin) {
config.adaptive_digital.extra_saturation_margin_db = 0.f;
gain_controller2.ApplyConfig(config);
EXPECT_GT(GainAfterProcessingFile(&gain_controller2), 2.f);
EXPECT_GT(GainAfterProcessingFile(&gain_controller2), 1.9f);
}
} // namespace test

View File

@ -57,6 +57,17 @@ std::string GainController2LevelEstimatorToString(
RTC_CHECK_NOTREACHED();
}
// Maps an AGC2 noise estimator type to the name used in
// `AudioProcessing::Config::ToString()`.
std::string GainController2NoiseEstimatorToString(
    const Agc2Config::NoiseEstimator& type) {
  using NoiseEstimator = Agc2Config::NoiseEstimator;
  // No `default:` label, so that unhandled enumerators can be flagged by the
  // compiler.
  switch (type) {
    case NoiseEstimator::kNoiseFloor:
      return "NoiseFloor";
    case NoiseEstimator::kStationaryNoise:
      return "StationaryNoise";
  }
  // Reached only if `type` holds a value outside the enum.
  RTC_CHECK_NOTREACHED();
}
} // namespace
constexpr int AudioProcessing::kNativeSampleRatesHz[];
@ -160,7 +171,9 @@ std::string AudioProcessing::Config::ToString() const {
<< ", fixed_digital: { gain_db: "
<< gain_controller2.fixed_digital.gain_db
<< " }, adaptive_digital: { enabled: "
<< gain_controller2.adaptive_digital.enabled
<< gain_controller2.adaptive_digital.enabled << ", noise_estimator: "
<< GainController2NoiseEstimatorToString(
gain_controller2.adaptive_digital.noise_estimator)
<< ", level_estimator: { vad_probability_attack: "
<< gain_controller2.adaptive_digital.vad_probability_attack << ", type: "
<< GainController2LevelEstimatorToString(

View File

@ -350,21 +350,23 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
}
enum LevelEstimator { kRms, kPeak };
enum NoiseEstimator { kStationaryNoise, kNoiseFloor };
bool enabled = false;
struct FixedDigital {
float gain_db = 0.0f;
} fixed_digital;
struct AdaptiveDigital {
bool enabled = false;
NoiseEstimator noise_estimator = kNoiseFloor;
int vad_reset_period_ms = 1500;
float vad_probability_attack = 0.3f;
float vad_probability_attack = 0.9f;
LevelEstimator level_estimator = kRms;
int level_estimator_adjacent_speech_frames_threshold = 6;
int level_estimator_adjacent_speech_frames_threshold = 11;
// TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`.
bool use_saturation_protector = true;
float initial_saturation_margin_db = 20.0f;
float extra_saturation_margin_db = 5.0f;
int gain_applier_adjacent_speech_frames_threshold = 6;
int gain_applier_adjacent_speech_frames_threshold = 11;
float max_gain_change_db_per_second = 3.0f;
float max_output_noise_level_dbfs = -55.0f;
bool sse2_allowed = true;