Level estimation and saturation protection stub.

The level estimator (AdaptiveModeLevelEstimator) produces a biased estimate of the speech level. In our model, we use another module (the SaturationProtector) to compute the bias. This CL contains the estimator and a stub of the saturation protector.

Bug: webrtc:7494
Change-Id: I0df736d0346063f544fa680b4cc84177ea548545
Reviewed-on: https://webrtc-review.googlesource.com/64820
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#22641}
2018-03-28 09:45:29 +02:00
parent e24c41ea45
commit 1e48e8095c
9 changed files with 269 additions and 4 deletions
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -548,6 +548,7 @@ if (rtc_include_tests) {
      "../../test:test_support",
      "../audio_coding:neteq_input_audio_tools",
      "aec_dump:mock_aec_dump_unittests",
+      "agc2:adaptive_digital_unittests",
      "agc2:fixed_digital_unittests",
      "test/conversational_speech:unittest",
      "vad:vad_unittests",
--- a/modules/audio_processing/agc2/BUILD.gn
+++ b/modules/audio_processing/agc2/BUILD.gn
@@ -25,6 +25,8 @@ rtc_source_set("adaptive_digital") {
    "adaptive_mode_level_estimator.h",
    "noise_level_estimator.cc",
    "noise_level_estimator.h",
+    "saturation_protector.cc",
+    "saturation_protector.h",
  ]

  configs += [ "..:apm_debug_dump" ]
@@ -126,3 +128,25 @@ rtc_source_set("fixed_digital_unittests") {
    "../../../rtc_base:rtc_base_tests_utils",
  ]
 }
+
+rtc_source_set("adaptive_digital_unittests") {
+  testonly = true
+  configs += [ "..:apm_debug_dump" ]
+
+  sources = [
+    "adaptive_mode_level_estimator_unittest.cc",
+  ]
+  deps = [
+    ":adaptive_digital",
+    ":common",
+    ":test_utils",
+    "..:apm_logging",
+    "..:audio_frame_view",
+    "../../../api:array_view",
+    "../../../common_audio",
+    "../../../rtc_base:checks",
+    "../../../rtc_base:rtc_base_approved",
+    "../../../rtc_base:rtc_base_tests_utils",
+    "../vad:vad_with_level",
+  ]
+}
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
@@ -17,7 +17,9 @@
 namespace webrtc {

 AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
-    ApmDataDumper* apm_data_dumper) {}
+    ApmDataDumper* apm_data_dumper)
+    : saturation_protector_(apm_data_dumper),
+      apm_data_dumper_(apm_data_dumper) {}

 void AdaptiveModeLevelEstimator::UpdateEstimation(
    const VadWithLevel::LevelAndProbability& vad_data) {
@@ -27,10 +29,40 @@ void AdaptiveModeLevelEstimator::UpdateEstimation(
  RTC_DCHECK_LT(vad_data.speech_peak_dbfs, 50.f);
  RTC_DCHECK_GE(vad_data.speech_probability, 0.f);
  RTC_DCHECK_LE(vad_data.speech_probability, 1.f);
+
+  if (vad_data.speech_probability < kVadConfidenceThreshold) {
+    DebugDumpEstimate();
+    return;
+  }
+
+  const bool buffer_is_full = buffer_size_ms_ >= kFullBufferSizeMs;
+  if (!buffer_is_full) {
+    buffer_size_ms_ += kFrameDurationMs;
+  }
+
+  const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
+
+  estimate_numerator_ = estimate_numerator_ * leak_factor +
+                        vad_data.speech_rms_dbfs * vad_data.speech_probability;
+  estimate_denominator_ =
+      estimate_denominator_ * leak_factor + vad_data.speech_probability;
+
+  last_estimate_with_offset_dbfs_ = estimate_numerator_ / estimate_denominator_;
+
+  saturation_protector_.UpdateMargin(vad_data, last_estimate_with_offset_dbfs_);
+  DebugDumpEstimate();
 }

 float AdaptiveModeLevelEstimator::LatestLevelEstimate() const {
-  // TODO(webrtc:7494): This is a stub. Add implementation.
-  return 0.f;
+  return rtc::SafeClamp<float>(
+      last_estimate_with_offset_dbfs_ + saturation_protector_.LastMargin(),
+      -90.f, 0.f);
+}
+
+void AdaptiveModeLevelEstimator::DebugDumpEstimate() {
+  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_with_offset_dbfs",
+                            last_estimate_with_offset_dbfs_);
+  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs",
+                            LatestLevelEstimate());
 }
 }  // namespace webrtc
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
@@ -11,6 +11,7 @@
 #ifndef MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
 #define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_

+#include "modules/audio_processing/agc2/saturation_protector.h"
 #include "modules/audio_processing/vad/vad_with_level.h"

 namespace webrtc {
@@ -21,6 +22,16 @@ class AdaptiveModeLevelEstimator {
  explicit AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper);
  void UpdateEstimation(const VadWithLevel::LevelAndProbability& vad_data);
  float LatestLevelEstimate() const;
+
+ private:
+  void DebugDumpEstimate();
+
+  int buffer_size_ms_ = 0;
+  float last_estimate_with_offset_dbfs_ = kInitialSpeechLevelEstimateDbfs;
+  float estimate_numerator_ = 0.f;
+  float estimate_denominator_ = 0.f;
+  SaturationProtector saturation_protector_;
+  ApmDataDumper* const apm_data_dumper_;
 };

 }  // namespace webrtc
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/gunit.h"
+
+namespace webrtc {
+namespace {
+void RunOnConstantLevel(int num_iterations,
+                        VadWithLevel::LevelAndProbability vad_data,
+                        AdaptiveModeLevelEstimator* level_estimator) {
+  for (int i = 0; i < num_iterations; ++i) {
+    level_estimator->UpdateEstimation(vad_data);  // By copy
+  }
+}
+}  // namespace
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+     EstimatorShouldNotCrash) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  VadWithLevel::LevelAndProbability vad_data(1.f, -20.f, -10.f);
+  level_estimator.UpdateEstimation(vad_data);
+  static_cast<void>(level_estimator.LatestLevelEstimate());
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  constexpr float kSpeechRmsDbfs = -15.f;
+  RunOnConstantLevel(
+      100,
+      VadWithLevel::LevelAndProbability(
+          1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+      &level_estimator);
+
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+     EstimatorIgnoresZeroProbabilityFrames) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  // Run for one second of fake audio.
+  constexpr float kSpeechRmsDbfs = -25.f;
+  RunOnConstantLevel(
+      100,
+      VadWithLevel::LevelAndProbability(
+          1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+      &level_estimator);
+
+  // Run for one more second, but mark as not speech.
+  constexpr float kNoiseRmsDbfs = 0.f;
+  RunOnConstantLevel(
+      100, VadWithLevel::LevelAndProbability(0.f, kNoiseRmsDbfs, kNoiseRmsDbfs),
+      &level_estimator);
+
+  // Level should not have changed.
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  // Run for one 'window size' interval
+  constexpr float kInitialSpeechRmsDbfs = -30.f;
+  RunOnConstantLevel(
+      kFullBufferSizeMs / kFrameDurationMs,
+      VadWithLevel::LevelAndProbability(
+          1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kInitialSpeechRmsDbfs),
+      &level_estimator);
+
+  // Run for one half 'window size' interval. This should not be enough to
+  // adapt.
+  constexpr float kDifferentSpeechRmsDbfs = -10.f;
+  // Adaptation threshold: 25% of the step between the two speech levels.
+  const float kMaxDifferenceDb =
+      0.25 * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
+  RunOnConstantLevel(
+      static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
+      VadWithLevel::LevelAndProbability(
+          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kDifferentSpeechRmsDbfs),
+      &level_estimator);
+  EXPECT_GT(
+      std::abs(kDifferentSpeechRmsDbfs - level_estimator.LatestLevelEstimate()),
+      kMaxDifferenceDb);
+
+  // Run for some more time. Afterwards, we should have adapted.
+  RunOnConstantLevel(
+      static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
+      VadWithLevel::LevelAndProbability(
+          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kDifferentSpeechRmsDbfs),
+      &level_estimator);
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kDifferentSpeechRmsDbfs,
+              kMaxDifferenceDb);
+}
+
+}  // namespace webrtc
--- a/modules/audio_processing/agc2/agc2_common.h
+++ b/modules/audio_processing/agc2/agc2_common.h
@@ -27,6 +27,18 @@ constexpr size_t kMaximalNumberOfSamplesPerChannel = 480;

 constexpr float kAttackFilterConstant = 0.f;

+// Used in the Level Estimator for deciding when to update the speech
+// level estimate.
+constexpr float kVadConfidenceThreshold = 0.9f;
+
+// The amount of 'memory' of the Level Estimator. Decides leak factors.
+constexpr float kFullBufferSizeMs = 1000.f;
+constexpr float kFullBufferLeakFactor = 1.f - 1.f / kFullBufferSizeMs;
+
+constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
+
+constexpr float kInitialSaturationMarginDb = 17.f;
+
 // This is computed from kDecayMs by
 // 10 ** (-1/20 * subframe_duration / kDecayMs).
 // |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
--- a/modules/audio_processing/agc2/noise_level_estimator.h
+++ b/modules/audio_processing/agc2/noise_level_estimator.h
@@ -20,7 +20,7 @@ class NoiseLevelEstimator {
 public:
  NoiseLevelEstimator() {}

-  // Returns the estimated noise level in DbFS.
+  // Returns the estimated noise level in dBFS.
  float Analyze(AudioFrameView<const float> frame);

 private:
--- a/modules/audio_processing/agc2/saturation_protector.cc
+++ b/modules/audio_processing/agc2/saturation_protector.cc
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/saturation_protector.h"
+
+#include <algorithm>
+
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+
+SaturationProtector::SaturationProtector(ApmDataDumper* apm_data_dumper) {}
+
+void SaturationProtector::UpdateMargin(
+    const VadWithLevel::LevelAndProbability& vad_data,
+    float last_speech_level_estimate) {}
+
+float SaturationProtector::LastMargin() const {
+  return kInitialSaturationMarginDb;
+}
+}  // namespace webrtc
--- a/modules/audio_processing/agc2/saturation_protector.h
+++ b/modules/audio_processing/agc2/saturation_protector.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+
+#include <array>
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/vad/vad_with_level.h"
+
+namespace webrtc {
+
+class ApmDataDumper;
+
+class SaturationProtector {
+ public:
+  explicit SaturationProtector(ApmDataDumper* apm_data_dumper);
+
+  // Updates the margin estimate. This method should be called
+  // whenever a frame is reliably classified as 'speech'.
+  //
+  // The updated margin (in dB) is available via LastMargin().
+  void UpdateMargin(const VadWithLevel::LevelAndProbability& vad_data,
+                    float last_speech_level_estimate_dbfs);
+
+  // Returns latest computed margin. Used in cases when speech is not
+  // detected.
+  float LastMargin() const;
+};
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_