AudioProcessingImpl: Add a VAD submodule

Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl and enable injecting speech probability into GainController2. Bug: webrtc:13663 Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166 Commit-Queue: Hanna Silen <silen@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/main@{#37275}
2022-06-16 16:35:45 +02:00
parent ff45105b42
commit 0c1ad2992b
7 changed files with 348 additions and 17 deletions
--- a/modules/audio_processing/gain_controller2_unittest.cc
+++ b/modules/audio_processing/gain_controller2_unittest.cc
@ -47,7 +47,7 @@ float RunAgc2WithConstantInput(GainController2& agc2,
  // Give time to the level estimator to converge.
  for (int i = 0; i < num_frames + 1; ++i) {
    SetAudioBufferSamples(input_level, ab);
-    agc2.Process(&ab);
+    agc2.Process(/*speech_probability=*/absl::nullopt, &ab);
  }

  // Return the last sample from the last processed frame.
@ -62,7 +62,8 @@ std::unique_ptr<GainController2> CreateAgc2FixedDigitalMode(
  config.fixed_digital.gain_db = fixed_gain_db;
  EXPECT_TRUE(GainController2::Validate(config));
  return std::make_unique<GainController2>(config, sample_rate_hz,
-                                           /*num_channels=*/1);
+                                           /*num_channels=*/1,
+                                           /*use_internal_vad=*/true);
 }

 }  // namespace
@ -138,7 +139,8 @@ TEST(GainController2, CheckAdaptiveDigitalMaxOutputNoiseLevelConfig) {
 // Checks that the default config is applied.
 TEST(GainController2, ApplyDefaultConfig) {
  auto gain_controller2 = std::make_unique<GainController2>(
-      Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2);
+      Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2,
+      /*use_internal_vad=*/true);
  EXPECT_TRUE(gain_controller2.get());
 }

@ -253,7 +255,8 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
  Agc2Config config;
  config.fixed_digital.gain_db = 0.0f;
  config.adaptive_digital.enabled = true;
-  GainController2 agc2(config, kSampleRateHz, kStereo);
+  GainController2 agc2(config, kSampleRateHz, kStereo,
+                       /*use_internal_vad=*/true);

  test::InputAudioFile input_file(
      test::GetApmCaptureTestVectorFileName(kSampleRateHz),
@ -276,16 +279,16 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
                                   stream_config.num_channels(), &input_file,
                                   frame);
    // Apply a fixed gain to the input audio.
-    for (float& x : frame)
+    for (float& x : frame) {
      x *= gain;
+    }
    test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
-    // Process.
-    agc2.Process(&audio_buffer);
+    agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
  }

  // Estimate the applied gain by processing a probing frame.
  SetAudioBufferSamples(/*value=*/1.0f, audio_buffer);
-  agc2.Process(&audio_buffer);
+  agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
  const float applied_gain_db =
      20.0f * std::log10(audio_buffer.channels_const()[0][0]);

@ -294,5 +297,196 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
  EXPECT_NEAR(applied_gain_db, kExpectedGainDb, kToleranceDb);
 }

+// Processes a test audio file and checks that the injected speech probability
+// is ignored when the internal VAD is used.
+TEST(GainController2,
+     CheckInjectedVadProbabilityNotUsedWithAdaptiveDigitalController) {
+  constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
+  constexpr int kStereo = 2;
+
+  // Create AGC2 enabling only the adaptive digital controller.
+  Agc2Config config;
+  config.fixed_digital.gain_db = 0.0f;
+  config.adaptive_digital.enabled = true;
+  GainController2 agc2(config, kSampleRateHz, kStereo,
+                       /*use_internal_vad=*/true);
+  GainController2 agc2_reference(config, kSampleRateHz, kStereo,
+                                 /*use_internal_vad=*/true);
+
+  test::InputAudioFile input_file(
+      test::GetApmCaptureTestVectorFileName(kSampleRateHz),
+      /*loop_at_end=*/true);
+  const StreamConfig stream_config(kSampleRateHz, kStereo);
+
+  // Init buffers.
+  constexpr int kFrameDurationMs = 10;
+  std::vector<float> frame(kStereo * stream_config.num_frames());
+  AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
+                           kSampleRateHz, kStereo);
+  AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
+                                     kStereo, kSampleRateHz, kStereo);
+
+  // Simulate.
+  constexpr float kGainDb = -6.0f;
+  const float gain = std::pow(10.0f, kGainDb / 20.0f);
+  constexpr int kDurationMs = 10000;
+  constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
+  constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
+  constexpr float kEpsilon = 0.0001f;
+  bool all_samples_zero = true;
+  for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
+    ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
+                                   stream_config.num_channels(), &input_file,
+                                   frame);
+    // Apply a fixed gain to the input audio.
+    for (float& x : frame) {
+      x *= gain;
+    }
+    test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
+    agc2.Process(kSpeechProbabilities[j], &audio_buffer);
+    test::CopyVectorToAudioBuffer(stream_config, frame,
+                                  &audio_buffer_reference);
+    agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
+
+    // Check the output buffers.
+    for (int i = 0; i < kStereo; ++i) {
+      for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
+        all_samples_zero &=
+            fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
+        EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
+                        audio_buffer_reference.channels_const()[i][j]);
+      }
+    }
+  }
+  EXPECT_FALSE(all_samples_zero);
+}
+
+// Processes a test audio file and checks that the injected speech probability
+// is not ignored when the internal VAD is not used.
+TEST(GainController2,
+     CheckInjectedVadProbabilityUsedWithAdaptiveDigitalController) {
+  constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
+  constexpr int kStereo = 2;
+
+  // Create AGC2 enabling only the adaptive digital controller.
+  Agc2Config config;
+  config.fixed_digital.gain_db = 0.0f;
+  config.adaptive_digital.enabled = true;
+  GainController2 agc2(config, kSampleRateHz, kStereo,
+                       /*use_internal_vad=*/false);
+  GainController2 agc2_reference(config, kSampleRateHz, kStereo,
+                                 /*use_internal_vad=*/true);
+
+  test::InputAudioFile input_file(
+      test::GetApmCaptureTestVectorFileName(kSampleRateHz),
+      /*loop_at_end=*/true);
+  const StreamConfig stream_config(kSampleRateHz, kStereo);
+
+  // Init buffers.
+  constexpr int kFrameDurationMs = 10;
+  std::vector<float> frame(kStereo * stream_config.num_frames());
+  AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
+                           kSampleRateHz, kStereo);
+  AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
+                                     kStereo, kSampleRateHz, kStereo);
+  // Simulate.
+  constexpr float kGainDb = -6.0f;
+  const float gain = std::pow(10.0f, kGainDb / 20.0f);
+  constexpr int kDurationMs = 10000;
+  constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
+  constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
+  constexpr float kEpsilon = 0.0001f;
+  bool all_samples_zero = true;
+  bool all_samples_equal = true;
+  for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
+    ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
+                                   stream_config.num_channels(), &input_file,
+                                   frame);
+    // Apply a fixed gain to the input audio.
+    for (float& x : frame) {
+      x *= gain;
+    }
+    test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
+    agc2.Process(kSpeechProbabilities[j], &audio_buffer);
+    test::CopyVectorToAudioBuffer(stream_config, frame,
+                                  &audio_buffer_reference);
+    agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
+    // Check the output buffers.
+    for (int i = 0; i < kStereo; ++i) {
+      for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
+        all_samples_zero &=
+            fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
+        all_samples_equal &=
+            fabs(audio_buffer.channels_const()[i][j] -
+                 audio_buffer_reference.channels_const()[i][j]) < kEpsilon;
+      }
+    }
+  }
+  EXPECT_FALSE(all_samples_zero);
+  EXPECT_FALSE(all_samples_equal);
+}
+
+// Processes a test audio file and checks that the output is equal when
+// an injected speech probability from `VoiceActivityDetectorWrapper` and
+// the speech probability computed by the internal VAD are the same.
+TEST(GainController2,
+     CheckEqualResultFromInjectedVadProbabilityWithAdaptiveDigitalController) {
+  constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
+  constexpr int kStereo = 2;
+
+  // Create AGC2 enabling only the adaptive digital controller.
+  Agc2Config config;
+  config.fixed_digital.gain_db = 0.0f;
+  config.adaptive_digital.enabled = true;
+  GainController2 agc2(config, kSampleRateHz, kStereo,
+                       /*use_internal_vad=*/false);
+  GainController2 agc2_reference(config, kSampleRateHz, kStereo,
+                                 /*use_internal_vad=*/true);
+  VoiceActivityDetectorWrapper vad(config.adaptive_digital.vad_reset_period_ms,
+                                   GetAvailableCpuFeatures(), kSampleRateHz);
+  test::InputAudioFile input_file(
+      test::GetApmCaptureTestVectorFileName(kSampleRateHz),
+      /*loop_at_end=*/true);
+  const StreamConfig stream_config(kSampleRateHz, kStereo);
+
+  // Init buffers.
+  constexpr int kFrameDurationMs = 10;
+  std::vector<float> frame(kStereo * stream_config.num_frames());
+  AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
+                           kSampleRateHz, kStereo);
+  AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
+                                     kStereo, kSampleRateHz, kStereo);
+
+  // Simulate.
+  constexpr float kGainDb = -6.0f;
+  const float gain = std::pow(10.0f, kGainDb / 20.0f);
+  constexpr int kDurationMs = 10000;
+  constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
+  for (int i = 0; i < kNumFramesToProcess; ++i) {
+    ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
+                                   stream_config.num_channels(), &input_file,
+                                   frame);
+    // Apply a fixed gain to the input audio.
+    for (float& x : frame) {
+      x *= gain;
+    }
+    test::CopyVectorToAudioBuffer(stream_config, frame,
+                                  &audio_buffer_reference);
+    agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
+    test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
+    agc2.Process(vad.Analyze(AudioFrameView<const float>(
+                     audio_buffer.channels(), audio_buffer.num_channels(),
+                     audio_buffer.num_frames())),
+                 &audio_buffer);
+    // Check the output buffer.
+    for (int i = 0; i < kStereo; ++i) {
+      for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
+        EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
+                        audio_buffer_reference.channels_const()[i][j]);
+      }
+    }
+  }
+}
+
 }  // namespace test
 }  // namespace webrtc