APM Transient Suppressor (TS): integrate VoiceProbabilityDelayUnit

This CL adds a component in the TS implementation to return a delayed version of the voice probability values observed when `Suppress()` is called. That is needed in order to temporally align the voice probability values to the processed audio since TS adds algorithmic delay. Bug: webrtc:13663 Change-Id: I5041ace3939d2ce7ba084ae703428e66f1aa06be Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/255860 Reviewed-by: Hanna Silen <silen@webrtc.org> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/main@{#36496}
2022-04-08 11:22:36 +02:00
parent 26b23b8fcc
commit 7efe5332f2
5 changed files with 133 additions and 36 deletions
--- a/modules/audio_processing/transient/transient_suppressor_unittest.cc
+++ b/modules/audio_processing/transient/transient_suppressor_unittest.cc
@ -10,21 +10,37 @@

 #include "modules/audio_processing/transient/transient_suppressor.h"

+#include <vector>
+
+#include "absl/types/optional.h"
 #include "modules/audio_processing/transient/common.h"
 #include "modules/audio_processing/transient/transient_suppressor_impl.h"
 #include "test/gtest.h"

 namespace webrtc {
+namespace {
+constexpr int kMono = 1;

-class TransientSuppressorImplTest
+// Returns the index of the first non-zero sample in `samples` or
+// `absl::nullopt` if all the samples are zero.
+absl::optional<int> FindFirstNonZeroSample(const std::vector<float>& samples) {
+  for (size_t i = 0; i < samples.size(); ++i) {
+    if (samples[i] != 0.0f) {
+      return i;
+    }
+  }
+  return absl::nullopt;
+}
+
+}  // namespace
+
+class TransientSuppressorVadModeParametrization
    : public ::testing::TestWithParam<TransientSuppressor::VadMode> {};

-TEST_P(TransientSuppressorImplTest,
+TEST_P(TransientSuppressorVadModeParametrization,
       TypingDetectionLogicWorksAsExpectedForMono) {
-  static const int kNumChannels = 1;
-
  TransientSuppressorImpl ts(GetParam(), ts::kSampleRate16kHz,
-                             ts::kSampleRate16kHz, kNumChannels);
+                             ts::kSampleRate16kHz, kMono);

  // Each key-press enables detection.
  EXPECT_FALSE(ts.detection_enabled_);
@ -88,10 +104,72 @@ TEST_P(TransientSuppressorImplTest,
 }

 INSTANTIATE_TEST_SUITE_P(
-    ,
    TransientSuppressorImplTest,
+    TransientSuppressorVadModeParametrization,
    ::testing::Values(TransientSuppressor::VadMode::kDefault,
                      TransientSuppressor::VadMode::kRnnVad,
                      TransientSuppressor::VadMode::kNoVad));

+class TransientSuppressorSampleRateParametrization
+    : public ::testing::TestWithParam<int> {};
+
+// Checks that voice probability and processed audio data are temporally aligned
+// after `Suppress()` is called.
+TEST_P(TransientSuppressorSampleRateParametrization,
+       CheckAudioAndVoiceProbabilityTemporallyAligned) {
+  const int sample_rate_hz = GetParam();
+  TransientSuppressorImpl ts(TransientSuppressor::VadMode::kDefault,
+                             sample_rate_hz,
+                             /*detection_rate_hz=*/sample_rate_hz, kMono);
+
+  const int frame_size = sample_rate_hz * ts::kChunkSizeMs / 1000;
+  std::vector<float> frame(frame_size);
+
+  constexpr int kMaxAttempts = 3;
+  for (int i = 0; i < kMaxAttempts; ++i) {
+    SCOPED_TRACE(i);
+
+    // Call `Suppress()` on frames of non-zero audio samples.
+    std::fill(frame.begin(), frame.end(), 1000.0f);
+    float delayed_voice_probability = ts.Suppress(
+        frame.data(), frame.size(), kMono, /*detection_data=*/nullptr,
+        /*detection_length=*/frame_size, /*reference_data=*/nullptr,
+        /*reference_length=*/frame_size, /*voice_probability=*/1.0f,
+        /*key_pressed=*/false);
+
+    // Detect the algorithmic delay of `TransientSuppressorImpl`.
+    absl::optional<int> frame_delay = FindFirstNonZeroSample(frame);
+
+    // Check that the delayed voice probability is delayed according to the
+    // measured delay.
+    if (frame_delay.has_value()) {
+      if (*frame_delay == 0) {
+        // When the delay is an integer multiple of the frame duration,
+        // `Suppress()` returns a copy of a previously observed voice
+        // probability value.
+        EXPECT_EQ(delayed_voice_probability, 1.0f);
+      } else {
+        // Instead, when the delay is fractional, `Suppress()` returns an
+        // interpolated value. Since the exact value depends on the
+        // interpolation method, we only check that the delayed voice
+        // probability is not zero as it must converge towards the previously
+        // observed value.
+        EXPECT_GT(delayed_voice_probability, 0.0f);
+      }
+      break;
+    } else {
+      // The algorithmic delay is longer than the duration of a single frame.
+      // Until the delay is detected, the delayed voice probability is zero.
+      EXPECT_EQ(delayed_voice_probability, 0.0f);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(TransientSuppressorImplTest,
+                         TransientSuppressorSampleRateParametrization,
+                         ::testing::Values(ts::kSampleRate8kHz,
+                                           ts::kSampleRate16kHz,
+                                           ts::kSampleRate32kHz,
+                                           ts::kSampleRate48kHz));
+
 }  // namespace webrtc