Level estimation and saturation protection stub.

The level estimator (AdaptiveModeLevelEstimator) produces a biased
estimate of the speech level. In our model, we use another module
(the SaturationProtector) to compute the bias. This CL contains the
estimator and a stub of the saturation protector.

Bug: webrtc:7494
Change-Id: I0df736d0346063f544fa680b4cc84177ea548545
Reviewed-on: https://webrtc-review.googlesource.com/64820
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#22641}
This commit is contained in:
Alex Loiko
2018-03-28 09:45:29 +02:00
committed by Commit Bot
parent e24c41ea45
commit 1e48e8095c
9 changed files with 269 additions and 4 deletions

View File

@ -548,6 +548,7 @@ if (rtc_include_tests) {
"../../test:test_support",
"../audio_coding:neteq_input_audio_tools",
"aec_dump:mock_aec_dump_unittests",
"agc2:adaptive_digital_unittests",
"agc2:fixed_digital_unittests",
"test/conversational_speech:unittest",
"vad:vad_unittests",

View File

@ -25,6 +25,8 @@ rtc_source_set("adaptive_digital") {
"adaptive_mode_level_estimator.h",
"noise_level_estimator.cc",
"noise_level_estimator.h",
"saturation_protector.cc",
"saturation_protector.h",
]
configs += [ "..:apm_debug_dump" ]
@ -126,3 +128,25 @@ rtc_source_set("fixed_digital_unittests") {
"../../../rtc_base:rtc_base_tests_utils",
]
}
# Test-only target that builds the unit tests for the ":adaptive_digital"
# sources; currently it contains only the adaptive mode level estimator test.
rtc_source_set("adaptive_digital_unittests") {
testonly = true
configs += [ "..:apm_debug_dump" ]
sources = [
"adaptive_mode_level_estimator_unittest.cc",
]
deps = [
":adaptive_digital",
":common",
":test_utils",
"..:apm_logging",
"..:audio_frame_view",
"../../../api:array_view",
"../../../common_audio",
"../../../rtc_base:checks",
"../../../rtc_base:rtc_base_approved",
"../../../rtc_base:rtc_base_tests_utils",
"../vad:vad_with_level",
]
}

View File

@ -17,7 +17,9 @@
namespace webrtc {
AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
ApmDataDumper* apm_data_dumper) {}
ApmDataDumper* apm_data_dumper)
: saturation_protector_(apm_data_dumper),
apm_data_dumper_(apm_data_dumper) {}
void AdaptiveModeLevelEstimator::UpdateEstimation(
const VadWithLevel::LevelAndProbability& vad_data) {
@ -27,10 +29,40 @@ void AdaptiveModeLevelEstimator::UpdateEstimation(
RTC_DCHECK_LT(vad_data.speech_peak_dbfs, 50.f);
RTC_DCHECK_GE(vad_data.speech_probability, 0.f);
RTC_DCHECK_LE(vad_data.speech_probability, 1.f);
if (vad_data.speech_probability < kVadConfidenceThreshold) {
DebugDumpEstimate();
return;
}
const bool buffer_is_full = buffer_size_ms_ >= kFullBufferSizeMs;
if (!buffer_is_full) {
buffer_size_ms_ += kFrameDurationMs;
}
const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
estimate_numerator_ = estimate_numerator_ * leak_factor +
vad_data.speech_rms_dbfs * vad_data.speech_probability;
estimate_denominator_ =
estimate_denominator_ * leak_factor + vad_data.speech_probability;
last_estimate_with_offset_dbfs_ = estimate_numerator_ / estimate_denominator_;
saturation_protector_.UpdateMargin(vad_data, last_estimate_with_offset_dbfs_);
DebugDumpEstimate();
}
float AdaptiveModeLevelEstimator::LatestLevelEstimate() const {
// TODO(webrtc:7494): This is a stub. Add implementation.
return 0.f;
return rtc::SafeClamp<float>(
last_estimate_with_offset_dbfs_ + saturation_protector_.LastMargin(),
-90.f, 0.f);
}
// Writes two values to the APM data dumper: the stored estimate that still
// carries the offset, and the value currently reported by
// LatestLevelEstimate().
void AdaptiveModeLevelEstimator::DebugDumpEstimate() {
  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_with_offset_dbfs",
                            last_estimate_with_offset_dbfs_);

  const float reported_estimate_dbfs = LatestLevelEstimate();
  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs",
                            reported_estimate_dbfs);
}
} // namespace webrtc

View File

@ -11,6 +11,7 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
#define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
#include "modules/audio_processing/agc2/saturation_protector.h"
#include "modules/audio_processing/vad/vad_with_level.h"
namespace webrtc {
@ -21,6 +22,16 @@ class AdaptiveModeLevelEstimator {
explicit AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper);
void UpdateEstimation(const VadWithLevel::LevelAndProbability& vad_data);
float LatestLevelEstimate() const;
private:
void DebugDumpEstimate();
int buffer_size_ms_ = 0;
float last_estimate_with_offset_dbfs_ = kInitialSpeechLevelEstimateDbfs;
float estimate_numerator_ = 0.f;
float estimate_denominator_ = 0.f;
SaturationProtector saturation_protector_;
ApmDataDumper* const apm_data_dumper_;
};
} // namespace webrtc

View File

@ -0,0 +1,115 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/gunit.h"
namespace webrtc {
namespace {
// Feeds the same |vad_data| to |level_estimator| |num_iterations| times.
// Nothing is fed when |num_iterations| is zero or negative.
void RunOnConstantLevel(int num_iterations,
                        VadWithLevel::LevelAndProbability vad_data,
                        AdaptiveModeLevelEstimator* level_estimator) {
  int remaining_frames = num_iterations;
  while (remaining_frames > 0) {
    // |vad_data| is this function's own copy; the estimator reads it by
    // const reference.
    level_estimator->UpdateEstimation(vad_data);
    --remaining_frames;
  }
}
} // namespace
// Smoke test: one update followed by one query must complete without
// crashing. The returned level is deliberately ignored.
TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
     EstimatorShouldNotCrash) {
  ApmDataDumper data_dumper(0);
  AdaptiveModeLevelEstimator estimator(&data_dumper);

  VadWithLevel::LevelAndProbability single_frame(1.f, -20.f, -10.f);
  estimator.UpdateEstimation(single_frame);

  static_cast<void>(estimator.LatestLevelEstimate());
}
// Feeds 100 identical frames with speech probability 1 and checks that the
// reported level ends up within 0.1 dB of kSpeechRmsDbfs.
TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
  constexpr int kNumFrames = 100;
  constexpr float kSpeechRmsDbfs = -15.f;
  constexpr float kToleranceDb = 0.1f;

  ApmDataDumper data_dumper(0);
  AdaptiveModeLevelEstimator estimator(&data_dumper);

  RunOnConstantLevel(kNumFrames,
                     VadWithLevel::LevelAndProbability(
                         1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb,
                         kSpeechRmsDbfs),
                     &estimator);

  EXPECT_NEAR(estimator.LatestLevelEstimate(), kSpeechRmsDbfs, kToleranceDb);
}
// Verifies that frames marked as not speech (speech probability 0) leave the
// reported level unchanged.
TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
     EstimatorIgnoresZeroProbabilityFrames) {
  constexpr int kFramesPerPhase = 100;
  constexpr float kSpeechRmsDbfs = -25.f;
  constexpr float kNoiseRmsDbfs = 0.f;
  constexpr float kToleranceDb = 0.1f;

  ApmDataDumper data_dumper(0);
  AdaptiveModeLevelEstimator estimator(&data_dumper);

  // Phase 1: one second of fake audio, marked as speech.
  RunOnConstantLevel(kFramesPerPhase,
                     VadWithLevel::LevelAndProbability(
                         1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb,
                         kSpeechRmsDbfs),
                     &estimator);

  // Phase 2: one more second at a different level, marked as not speech.
  RunOnConstantLevel(
      kFramesPerPhase,
      VadWithLevel::LevelAndProbability(0.f, kNoiseRmsDbfs, kNoiseRmsDbfs),
      &estimator);

  // The second phase must not have moved the estimate.
  EXPECT_NEAR(estimator.LatestLevelEstimate(), kSpeechRmsDbfs, kToleranceDb);
}
// Checks how quickly the estimate follows a change in speech level. After
// half a 'window size' at the new level the estimate must still be further
// than the tolerance from that level; after three more windows it must be
// within the tolerance.
TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
  ApmDataDumper apm_data_dumper(0);
  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);

  // Run for one 'window size' interval. kFullBufferSizeMs is a float, so the
  // frame count is converted to int explicitly, as in the calls below.
  constexpr float kInitialSpeechRmsDbfs = -30.f;
  RunOnConstantLevel(
      static_cast<int>(kFullBufferSizeMs / kFrameDurationMs),
      VadWithLevel::LevelAndProbability(
          1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
          kInitialSpeechRmsDbfs),
      &level_estimator);

  constexpr float kDifferentSpeechRmsDbfs = -10.f;
  // Tolerance: 25% of the gap between the initial and the new level.
  const float kMaxDifferenceDb =
      0.25f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);

  // Run for one half 'window size' interval. This should not be enough to
  // adapt: the estimate must still differ from the new level by more than
  // the tolerance.
  RunOnConstantLevel(
      static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
      VadWithLevel::LevelAndProbability(
          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
          kDifferentSpeechRmsDbfs),
      &level_estimator);
  EXPECT_GT(
      std::abs(kDifferentSpeechRmsDbfs - level_estimator.LatestLevelEstimate()),
      kMaxDifferenceDb);

  // Run for some more time. Afterwards, we should have adapted.
  RunOnConstantLevel(
      static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
      VadWithLevel::LevelAndProbability(
          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
          kDifferentSpeechRmsDbfs),
      &level_estimator);
  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kDifferentSpeechRmsDbfs,
              kMaxDifferenceDb);
}
} // namespace webrtc

View File

@ -27,6 +27,18 @@ constexpr size_t kMaximalNumberOfSamplesPerChannel = 480;
constexpr float kAttackFilterConstant = 0.f;
// Used in the Level Estimator for deciding when to update the speech
// level estimate. Frames whose speech probability is below this value
// do not update the estimate.
constexpr float kVadConfidenceThreshold = 0.9f;
// The amount of 'memory' of the Level Estimator. Decides leak factors.
constexpr float kFullBufferSizeMs = 1000.f;
// Factor applied to the running numerator and denominator of the level
// estimate on every update once the buffer is full.
constexpr float kFullBufferLeakFactor = 1.f - 1.f / kFullBufferSizeMs;
// Value the Level Estimator's speech level estimate starts from, in dBFS.
constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
// Initial saturation margin, in dB.
constexpr float kInitialSaturationMarginDb = 17.f;
// This is computed from kDecayMs by
// 10 ** (-1/20 * subframe_duration / kDecayMs).
// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.

View File

@ -20,7 +20,7 @@ class NoiseLevelEstimator {
public:
NoiseLevelEstimator() {}
// Returns the estimated noise level in DbFS.
// Returns the estimated noise level in dBFS.
float Analyze(AudioFrameView<const float> frame);
private:

View File

@ -0,0 +1,29 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/saturation_protector.h"
#include <algorithm>
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/numerics/safe_minmax.h"
namespace webrtc {
// Stub constructor: |apm_data_dumper| is accepted but not used here.
SaturationProtector::SaturationProtector(ApmDataDumper* apm_data_dumper) {
}
// Stub: the body is empty, so neither |vad_data| nor
// |last_speech_level_estimate_dbfs| is used and no state is updated. The
// parameter names match the declaration in saturation_protector.h.
void SaturationProtector::UpdateMargin(
    const VadWithLevel::LevelAndProbability& vad_data,
    float last_speech_level_estimate_dbfs) {}
// Stub: always reports the initial saturation margin constant.
float SaturationProtector::LastMargin() const {
  const float margin_db = kInitialSaturationMarginDb;
  return margin_db;
}
} // namespace webrtc

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
#include <array>
#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/vad/vad_with_level.h"
namespace webrtc {
class ApmDataDumper;
// Provides the saturation margin that AdaptiveModeLevelEstimator adds to its
// speech level estimate.
class SaturationProtector {
public:
explicit SaturationProtector(ApmDataDumper* apm_data_dumper);
// Updates the margin estimate. This method should be called
// whenever a frame is reliably classified as 'speech'.
//
// Nothing is returned; read the updated margin with LastMargin().
void UpdateMargin(const VadWithLevel::LevelAndProbability& vad_data,
float last_speech_level_estimate_dbfs);
// Returns latest computed margin. Used in cases when speech is not
// detected.
//
// Returned value is in dB.
float LastMargin() const;
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_