From 530781d03f1844e739a1afe63c05ce65c180a3aa Mon Sep 17 00:00:00 2001
From: Alessio Bazzica
Date: Fri, 25 Sep 2020 13:24:36 +0200
Subject: [PATCH] AGC2 VadWithLevel(::LevelAndProbability) renamed + injectable
 VAD

Refactoring CL to improve names and to allow injecting a VAD into
`VadLevelAnalyzer` (the new name for `VadWithLevel`). Injection is needed
to pass a mock VAD and write better unit tests as new features are added
to the class.

Bug: webrtc:7494
Change-Id: Ic0cea1e86a19a82533bd40fa04c061be3c44f068
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/185180
Commit-Queue: Alessio Bazzica
Reviewed-by: Minyue Li
Cr-Commit-Position: refs/heads/master@{#32195}
---
 modules/audio_processing/agc2/adaptive_agc.cc |  4 +-
 modules/audio_processing/agc2/adaptive_agc.h  |  2 +-
 .../agc2/adaptive_digital_gain_applier.h      |  2 +-
 .../adaptive_digital_gain_applier_unittest.cc | 11 +--
 .../agc2/adaptive_mode_level_estimator.cc     | 26 ++---
 .../agc2/adaptive_mode_level_estimator.h      |  2 +-
 .../agc2/adaptive_mode_level_estimator_agc.h  |  2 +-
 .../adaptive_mode_level_estimator_unittest.cc | 79 +++++++++------
 .../audio_processing/agc2/vad_with_level.cc   | 94 +++++++++++--------
 .../audio_processing/agc2/vad_with_level.h    | 48 +++++-----
 .../agc2/vad_with_level_unittest.cc           | 11 +--
 11 files changed, 158 insertions(+), 123 deletions(-)

diff --git a/modules/audio_processing/agc2/adaptive_agc.cc b/modules/audio_processing/agc2/adaptive_agc.cc
index a5d36089c4..816754d08d 100644
--- a/modules/audio_processing/agc2/adaptive_agc.cc
+++ b/modules/audio_processing/agc2/adaptive_agc.cc
@@ -47,9 +47,9 @@ void AdaptiveAgc::Process(AudioFrameView<float> float_frame,
   apm_data_dumper_->DumpRaw("agc2_vad_probability",
                             signal_with_levels.vad_result.speech_probability);
   apm_data_dumper_->DumpRaw("agc2_vad_rms_dbfs",
-                            signal_with_levels.vad_result.speech_rms_dbfs);
+                            signal_with_levels.vad_result.rms_dbfs);
   apm_data_dumper_->DumpRaw("agc2_vad_peak_dbfs",
-                            signal_with_levels.vad_result.speech_peak_dbfs);
+                            signal_with_levels.vad_result.peak_dbfs);
 
   speech_level_estimator_.UpdateEstimation(signal_with_levels.vad_result);
 
diff --git a/modules/audio_processing/agc2/adaptive_agc.h b/modules/audio_processing/agc2/adaptive_agc.h
index 16c0082ed8..82a3f0e633 100644
--- a/modules/audio_processing/agc2/adaptive_agc.h
+++ b/modules/audio_processing/agc2/adaptive_agc.h
@@ -33,7 +33,7 @@ class AdaptiveAgc {
 
  private:
   AdaptiveModeLevelEstimator speech_level_estimator_;
-  VadWithLevel vad_;
+  VadLevelAnalyzer vad_;
   AdaptiveDigitalGainApplier gain_applier_;
   ApmDataDumper* const apm_data_dumper_;
   NoiseLevelEstimator noise_level_estimator_;
diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
index e7f07fcf06..bef7017e01 100644
--- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
+++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
@@ -26,7 +26,7 @@ struct SignalWithLevels {
 
   float input_level_dbfs = -1.f;
   float input_noise_level_dbfs = -1.f;
-  VadWithLevel::LevelAndProbability vad_result;
+  VadLevelAnalyzer::Result vad_result;
   float limiter_audio_level_dbfs = -1.f;
   bool estimate_is_confident = false;
   AudioFrameView<float> float_frame;
diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
index 6e77cdac68..9e31655d3d 100644
--- a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
+++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
@@ -23,11 +23,13 @@ namespace {
 // Constants used in place of estimated noise levels.
 constexpr float kNoNoiseDbfs = -90.f;
 constexpr float kWithNoiseDbfs = -20.f;
-constexpr VadWithLevel::LevelAndProbability kVadSpeech(1.f, -20.f, 0.f);
+static_assert(std::is_trivially_destructible<VadLevelAnalyzer::Result>::value,
+              "");
+constexpr VadLevelAnalyzer::Result kVadSpeech{1.f, -20.f, 0.f};
 
 // Runs gain applier and returns the applied gain in linear scale.
 float RunOnConstantLevel(int num_iterations,
-                         VadWithLevel::LevelAndProbability vad_data,
+                         VadLevelAnalyzer::Result vad_level,
                          float input_level_dbfs,
                          AdaptiveDigitalGainApplier* gain_applier) {
   float gain_linear = 0.f;
@@ -37,7 +39,7 @@ float RunOnConstantLevel(int num_iterations,
     SignalWithLevels signal_with_levels(fake_audio.float_frame_view());
     signal_with_levels.input_level_dbfs = input_level_dbfs;
     signal_with_levels.input_noise_level_dbfs = kNoNoiseDbfs;
-    signal_with_levels.vad_result = vad_data;
+    signal_with_levels.vad_result = vad_level;
     signal_with_levels.limiter_audio_level_dbfs = -2.f;
     signal_with_levels.estimate_is_confident = true;
     gain_applier->Process(signal_with_levels);
@@ -61,9 +63,6 @@ SignalWithLevels TestSignalWithLevel(AudioFrameView<float> float_frame) {
 }  // namespace
 
 TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
-  static_assert(
-      std::is_trivially_destructible<VadWithLevel::LevelAndProbability>::value,
-      "");
   ApmDataDumper apm_data_dumper(0);
   AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
 
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
index c59038bb1f..2f5e442aac 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
@@ -50,15 +50,15 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
       apm_data_dumper_(apm_data_dumper) {}
 
 void AdaptiveModeLevelEstimator::UpdateEstimation(
-    const VadWithLevel::LevelAndProbability& vad_data) {
-  RTC_DCHECK_GT(vad_data.speech_rms_dbfs, -150.f);
-  RTC_DCHECK_LT(vad_data.speech_rms_dbfs, 50.f);
-  RTC_DCHECK_GT(vad_data.speech_peak_dbfs, -150.f);
-  RTC_DCHECK_LT(vad_data.speech_peak_dbfs, 50.f);
-  RTC_DCHECK_GE(vad_data.speech_probability, 0.f);
-  RTC_DCHECK_LE(vad_data.speech_probability, 1.f);
+    const VadLevelAnalyzer::Result& vad_level) {
+  RTC_DCHECK_GT(vad_level.rms_dbfs, -150.f);
+  RTC_DCHECK_LT(vad_level.rms_dbfs, 50.f);
+  RTC_DCHECK_GT(vad_level.peak_dbfs, -150.f);
+  RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f);
+  RTC_DCHECK_GE(vad_level.speech_probability, 0.f);
+  RTC_DCHECK_LE(vad_level.speech_probability, 1.f);
 
-  if (vad_data.speech_probability < kVadConfidenceThreshold) {
+  if (vad_level.speech_probability < kVadConfidenceThreshold) {
     DebugDumpEstimate();
     return;
   }
@@ -76,22 +76,22 @@ void AdaptiveModeLevelEstimator::UpdateEstimation(
       AudioProcessing::Config::GainController2::LevelEstimator;
   switch (level_estimator_) {
     case LevelEstimatorType::kRms:
-      speech_level_dbfs = vad_data.speech_rms_dbfs;
+      speech_level_dbfs = vad_level.rms_dbfs;
       break;
     case LevelEstimatorType::kPeak:
-      speech_level_dbfs = vad_data.speech_peak_dbfs;
+      speech_level_dbfs = vad_level.peak_dbfs;
       break;
   }
 
   // Update speech level estimation.
   estimate_numerator_ = estimate_numerator_ * leak_factor +
-                        speech_level_dbfs * vad_data.speech_probability;
+                        speech_level_dbfs * vad_level.speech_probability;
   estimate_denominator_ =
-      estimate_denominator_ * leak_factor + vad_data.speech_probability;
+      estimate_denominator_ * leak_factor + vad_level.speech_probability;
   last_estimate_with_offset_dbfs_ = estimate_numerator_ / estimate_denominator_;
 
   if (use_saturation_protector_) {
-    saturation_protector_.UpdateMargin(vad_data.speech_peak_dbfs,
+    saturation_protector_.UpdateMargin(vad_level.peak_dbfs,
                                        last_estimate_with_offset_dbfs_);
     DebugDumpEstimate();
   }
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
index 737a189ac9..a02641c50c 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
@@ -40,7 +40,7 @@ class AdaptiveModeLevelEstimator {
                              bool use_saturation_protector,
                              float initial_saturation_margin_db,
                              float extra_saturation_margin_db);
-  void UpdateEstimation(const VadWithLevel::LevelAndProbability& vad_data);
+  void UpdateEstimation(const VadLevelAnalyzer::Result& vad_level);
   float LatestLevelEstimate() const;
   void Reset();
   bool LevelEstimationIsConfident() const {
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_agc.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator_agc.h
index 6d12339888..bc6fa843b5 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_agc.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_agc.h
@@ -43,7 +43,7 @@ class AdaptiveModeLevelEstimatorAgc : public Agc {
   static constexpr int kDefaultAgc2LevelHeadroomDbfs = -1;
   int32_t time_in_ms_since_last_estimate_ = 0;
   AdaptiveModeLevelEstimator level_estimator_;
-  VadWithLevel agc2_vad_;
+  VadLevelAnalyzer agc2_vad_;
   float latest_voice_probability_ = 0.f;
 };
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
index d761c5bb76..be1fc9482e 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
@@ -22,11 +22,14 @@ namespace {
 constexpr float kInitialSaturationMarginDb = 20.f;
 constexpr float kExtraSaturationMarginDb = 2.f;
 
+constexpr float kMinSpeechProbability = 0.f;
+constexpr float kMaxSpeechProbability = 1.f;
+
 void RunOnConstantLevel(int num_iterations,
-                        VadWithLevel::LevelAndProbability vad_data,
+                        const VadLevelAnalyzer::Result& vad_level,
                         AdaptiveModeLevelEstimator& level_estimator) {
   for (int i = 0; i < num_iterations; ++i) {
-    level_estimator.UpdateEstimation(vad_data);  // By copy
+    level_estimator.UpdateEstimation(vad_level);
   }
 }
 
@@ -49,8 +52,9 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
      EstimatorShouldNotCrash) {
   TestLevelEstimator level_estimator;
 
-  VadWithLevel::LevelAndProbability vad_data(1.f, -20.f, -10.f);
-  level_estimator.estimator->UpdateEstimation(vad_data);
+  VadLevelAnalyzer::Result vad_level{kMaxSpeechProbability, /*rms_dbfs=*/-20.f,
+                                     /*peak_dbfs=*/-10.f};
+  level_estimator.estimator->UpdateEstimation(vad_level);
   static_cast<void>(level_estimator.estimator->LatestLevelEstimate());
 }
 
@@ -58,11 +62,12 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
      LevelShouldStabilize) {
   TestLevelEstimator level_estimator;
 
   constexpr float kSpeechPeakDbfs = -15.f;
-  RunOnConstantLevel(
-      100,
-      VadWithLevel::LevelAndProbability(
-          1.f, kSpeechPeakDbfs - kInitialSaturationMarginDb, kSpeechPeakDbfs),
-      *level_estimator.estimator);
+  RunOnConstantLevel(100,
+                     VadLevelAnalyzer::Result{kMaxSpeechProbability,
+                                              /*rms_dbfs=*/kSpeechPeakDbfs -
+                                                  kInitialSaturationMarginDb,
+                                              kSpeechPeakDbfs},
+                     *level_estimator.estimator);
   EXPECT_NEAR(level_estimator.estimator->LatestLevelEstimate() -
                   kExtraSaturationMarginDb,
@@ -75,17 +80,20 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
 
   // Run for one second of fake audio.
   constexpr float kSpeechRmsDbfs = -25.f;
-  RunOnConstantLevel(
-      100,
-      VadWithLevel::LevelAndProbability(
-          1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
-      *level_estimator.estimator);
+  RunOnConstantLevel(100,
+                     VadLevelAnalyzer::Result{kMaxSpeechProbability,
+                                              /*rms_dbfs=*/kSpeechRmsDbfs -
+                                                  kInitialSaturationMarginDb,
+                                              /*peak_dbfs=*/kSpeechRmsDbfs},
+                     *level_estimator.estimator);
 
   // Run for one more second, but mark as not speech.
   constexpr float kNoiseRmsDbfs = 0.f;
-  RunOnConstantLevel(
-      100, VadWithLevel::LevelAndProbability(0.f, kNoiseRmsDbfs, kNoiseRmsDbfs),
-      *level_estimator.estimator);
+  RunOnConstantLevel(100,
+                     VadLevelAnalyzer::Result{kMinSpeechProbability,
+                                              /*rms_dbfs=*/kNoiseRmsDbfs,
+                                              /*peak_dbfs=*/kNoiseRmsDbfs},
+                     *level_estimator.estimator);
 
   // Level should not have changed.
   EXPECT_NEAR(level_estimator.estimator->LatestLevelEstimate() -
@@ -100,9 +108,10 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
   constexpr float kInitialSpeechRmsDbfs = -30.f;
   RunOnConstantLevel(
       kFullBufferSizeMs / kFrameDurationMs,
-      VadWithLevel::LevelAndProbability(
-          1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
-          kInitialSpeechRmsDbfs),
+      VadLevelAnalyzer::Result{
+          kMaxSpeechProbability,
+          /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
+          /*peak_dbfs=*/kInitialSpeechRmsDbfs},
       *level_estimator.estimator);
 
   // Run for one half 'window size' interval. This should not be enough to
@@ -110,12 +119,13 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
   constexpr float kDifferentSpeechRmsDbfs = -10.f;
   // It should at most differ by 25% after one half 'window size' interval.
   const float kMaxDifferenceDb =
-      0.25 * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
+      0.25f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
   RunOnConstantLevel(
       static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
-      VadWithLevel::LevelAndProbability(
-          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
-          kDifferentSpeechRmsDbfs),
+      VadLevelAnalyzer::Result{
+          kMaxSpeechProbability,
+          /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
       *level_estimator.estimator);
   EXPECT_GT(std::abs(kDifferentSpeechRmsDbfs -
                      level_estimator.estimator->LatestLevelEstimate()),
@@ -124,9 +134,10 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
   // Run for some more time. Afterwards, we should have adapted.
   RunOnConstantLevel(
       static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
-      VadWithLevel::LevelAndProbability(
-          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
-          kDifferentSpeechRmsDbfs),
+      VadLevelAnalyzer::Result{
+          kMaxSpeechProbability,
+          /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
       *level_estimator.estimator);
   EXPECT_NEAR(level_estimator.estimator->LatestLevelEstimate() -
                   kExtraSaturationMarginDb,
@@ -142,9 +153,10 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
   constexpr float kInitialSpeechRmsDbfs = -30.f;
   RunOnConstantLevel(
       kFullBufferSizeMs / kFrameDurationMs,
-      VadWithLevel::LevelAndProbability(
-          1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
-          kInitialSpeechRmsDbfs),
+      VadLevelAnalyzer::Result{
+          kMaxSpeechProbability,
+          /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
+          /*peak_dbfs=*/kInitialSpeechRmsDbfs},
       *level_estimator.estimator);
 
   constexpr float kDifferentSpeechRmsDbfs = -10.f;
@@ -153,9 +165,10 @@ TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
 
   RunOnConstantLevel(
       kFullBufferSizeMs / kFrameDurationMs / 2,
-      VadWithLevel::LevelAndProbability(
-          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
-          kDifferentSpeechRmsDbfs),
+      VadLevelAnalyzer::Result{
+          kMaxSpeechProbability,
+          /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
       *level_estimator.estimator);
 
   // The level should be close to 'kDifferentSpeechRmsDbfs'.
diff --git a/modules/audio_processing/agc2/vad_with_level.cc b/modules/audio_processing/agc2/vad_with_level.cc
index d4ec2ced98..d35fbef90a 100644
--- a/modules/audio_processing/agc2/vad_with_level.cc
+++ b/modules/audio_processing/agc2/vad_with_level.cc
@@ -16,55 +16,73 @@
 #include "api/array_view.h"
 #include "common_audio/include/audio_util.h"
+#include "common_audio/resampler/include/push_resampler.h"
 #include "modules/audio_processing/agc2/rnn_vad/common.h"
+#include "modules/audio_processing/agc2/rnn_vad/features_extraction.h"
+#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
+#include "rtc_base/checks.h"
 
 namespace webrtc {
-
 namespace {
 
-float ProcessForPeak(AudioFrameView<const float> frame) {
-  float current_max = 0;
-  for (const auto& x : frame.channel(0)) {
-    current_max = std::max(std::fabs(x), current_max);
-  }
-  return current_max;
-}
-
-float ProcessForRms(AudioFrameView<const float> frame) {
-  float rms = 0;
-  for (const auto& x : frame.channel(0)) {
-    rms += x * x;
+using VoiceActivityDetector = VadLevelAnalyzer::VoiceActivityDetector;
+
+// Default VAD that combines a resampler and the RNN VAD.
+// Computes the speech probability on the first channel.
+class Vad : public VoiceActivityDetector {
+ public:
+  Vad() = default;
+  Vad(const Vad&) = delete;
+  Vad& operator=(const Vad&) = delete;
+  ~Vad() = default;
+
+  float ComputeProbability(AudioFrameView<const float> frame) override {
+    // The source number of channels is 1, because we always use the 1st
+    // channel.
+    resampler_.InitializeIfNeeded(
+        /*sample_rate_hz=*/static_cast<int>(frame.samples_per_channel() * 100),
+        rnn_vad::kSampleRate24kHz,
+        /*num_channels=*/1);
+
+    std::array<float, rnn_vad::kFrameSize10ms24kHz> work_frame;
+    // Feed the 1st channel to the resampler.
+    resampler_.Resample(frame.channel(0).data(), frame.samples_per_channel(),
+                        work_frame.data(), rnn_vad::kFrameSize10ms24kHz);
+
+    std::array<float, rnn_vad::kFeatureVectorSize> feature_vector;
+    const bool is_silence = features_extractor_.CheckSilenceComputeFeatures(
+        work_frame, feature_vector);
+    return rnn_vad_.ComputeVadProbability(feature_vector, is_silence);
   }
-  return std::sqrt(rms / frame.samples_per_channel());
-}
+
+ private:
+  PushResampler<float> resampler_;
+  rnn_vad::FeaturesExtractor features_extractor_;
+  rnn_vad::RnnBasedVad rnn_vad_;
+};
+
 }  // namespace
 
-VadWithLevel::VadWithLevel() = default;
-VadWithLevel::~VadWithLevel() = default;
+VadLevelAnalyzer::VadLevelAnalyzer() : vad_(std::make_unique<Vad>()) {}
 
-VadWithLevel::LevelAndProbability VadWithLevel::AnalyzeFrame(
-    AudioFrameView<const float> frame) {
-  SetSampleRate(static_cast<int>(frame.samples_per_channel() * 100));
-  std::array<float, rnn_vad::kFrameSize10ms24kHz> work_frame;
-  // Feed the 1st channel to the resampler.
-  resampler_.Resample(frame.channel(0).data(), frame.samples_per_channel(),
-                      work_frame.data(), rnn_vad::kFrameSize10ms24kHz);
-
-  std::array<float, rnn_vad::kFeatureVectorSize> feature_vector;
-
-  const bool is_silence = features_extractor_.CheckSilenceComputeFeatures(
-      work_frame, feature_vector);
-  const float vad_probability =
-      rnn_vad_.ComputeVadProbability(feature_vector, is_silence);
-  return LevelAndProbability(vad_probability,
-                             FloatS16ToDbfs(ProcessForRms(frame)),
-                             FloatS16ToDbfs(ProcessForPeak(frame)));
+VadLevelAnalyzer::VadLevelAnalyzer(std::unique_ptr<VoiceActivityDetector> vad)
+    : vad_(std::move(vad)) {
+  RTC_DCHECK(vad_);
 }
 
-void VadWithLevel::SetSampleRate(int sample_rate_hz) {
-  // The source number of channels in 1, because we always use the 1st
-  // channel.
-  resampler_.InitializeIfNeeded(sample_rate_hz, rnn_vad::kSampleRate24kHz,
-                                1 /* num_channels */);
+VadLevelAnalyzer::~VadLevelAnalyzer() = default;
+
+VadLevelAnalyzer::Result VadLevelAnalyzer::AnalyzeFrame(
+    AudioFrameView<const float> frame) {
+  float peak = 0.f;
+  float rms = 0.f;
+  for (const auto& x : frame.channel(0)) {
+    peak = std::max(std::fabs(x), peak);
+    rms += x * x;
+  }
+  return {vad_->ComputeProbability(frame),
+          FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())),
+          FloatS16ToDbfs(peak)};
 }
 
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/vad_with_level.h b/modules/audio_processing/agc2/vad_with_level.h
index b0ad868d4b..56eb79faf7 100644
--- a/modules/audio_processing/agc2/vad_with_level.h
+++ b/modules/audio_processing/agc2/vad_with_level.h
@@ -11,36 +11,42 @@
 #ifndef MODULES_AUDIO_PROCESSING_AGC2_VAD_WITH_LEVEL_H_
 #define MODULES_AUDIO_PROCESSING_AGC2_VAD_WITH_LEVEL_H_
 
-#include "common_audio/resampler/include/push_resampler.h"
-#include "modules/audio_processing/agc2/rnn_vad/features_extraction.h"
-#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
+#include <memory>
+
 #include "modules/audio_processing/include/audio_frame_view.h"
 
 namespace webrtc {
-class VadWithLevel {
+
+// Class to analyze voice activity and audio levels.
+class VadLevelAnalyzer {
  public:
-  struct LevelAndProbability {
-    constexpr LevelAndProbability(float prob, float rms, float peak)
-        : speech_probability(prob),
-          speech_rms_dbfs(rms),
-          speech_peak_dbfs(peak) {}
-    LevelAndProbability() = default;
-    float speech_probability = 0;
-    float speech_rms_dbfs = 0;  // Root mean square in decibels to full-scale.
-    float speech_peak_dbfs = 0;
+  struct Result {
+    float speech_probability;  // Range: [0, 1].
+    float rms_dbfs;            // Root mean square power (dBFS).
+    float peak_dbfs;           // Peak power (dBFS).
   };
 
-  VadWithLevel();
-  ~VadWithLevel();
+  // Voice Activity Detector (VAD) interface.
+  class VoiceActivityDetector {
+   public:
+    virtual ~VoiceActivityDetector() = default;
+    // Analyzes an audio frame and returns the speech probability.
+    virtual float ComputeProbability(AudioFrameView<const float> frame) = 0;
+  };
 
-  LevelAndProbability AnalyzeFrame(AudioFrameView<const float> frame);
+  // Ctor. Uses the default VAD.
+  VadLevelAnalyzer();
+  // Ctor. Uses a custom `vad`.
+  explicit VadLevelAnalyzer(std::unique_ptr<VoiceActivityDetector> vad);
+  VadLevelAnalyzer(const VadLevelAnalyzer&) = delete;
+  VadLevelAnalyzer& operator=(const VadLevelAnalyzer&) = delete;
+  ~VadLevelAnalyzer();
+
+  // Computes the speech probability and the level for `frame`.
+  Result AnalyzeFrame(AudioFrameView<const float> frame);
 
  private:
-  void SetSampleRate(int sample_rate_hz);
-
-  rnn_vad::RnnBasedVad rnn_vad_;
-  rnn_vad::FeaturesExtractor features_extractor_;
-  PushResampler<float> resampler_;
+  std::unique_ptr<VoiceActivityDetector> vad_;
 };
 
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/vad_with_level_unittest.cc b/modules/audio_processing/agc2/vad_with_level_unittest.cc
index f9aee62ba9..5017e8369e 100644
--- a/modules/audio_processing/agc2/vad_with_level_unittest.cc
+++ b/modules/audio_processing/agc2/vad_with_level_unittest.cc
@@ -13,7 +13,7 @@
 #include "rtc_base/gunit.h"
 
 namespace webrtc {
-namespace test {
+namespace {
 
 TEST(AutomaticGainController2VadWithLevelEstimator,
      PeakLevelGreaterThanRmsLevel) {
@@ -28,13 +28,12 @@ TEST(AutomaticGainController2VadWithLevelEstimator,
   AudioFrameView<float> frame_view(&channel0, 1, frame.size());
 
   // Compute audio frame levels (the VAD result is ignored).
-  VadWithLevel vad_with_level;
-  auto levels_and_vad_prob = vad_with_level.AnalyzeFrame(frame_view);
+  VadLevelAnalyzer analyzer;
+  auto levels_and_vad_prob = analyzer.AnalyzeFrame(frame_view);
 
   // Compare peak and RMS levels.
-  EXPECT_LT(levels_and_vad_prob.speech_rms_dbfs,
-            levels_and_vad_prob.speech_peak_dbfs);
+  EXPECT_LT(levels_and_vad_prob.rms_dbfs, levels_and_vad_prob.peak_dbfs);
 }
 
-}  // namespace test
+}  // namespace
 }  // namespace webrtc
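
Appendix (not part of the patch): the commit message motivates the injectable VAD with mock-based unit tests. Below is a minimal sketch of the kind of test the new `VadLevelAnalyzer(std::unique_ptr<VoiceActivityDetector>)` constructor enables, assuming GoogleMock is available through WebRTC's test/gmock.h and test/gtest.h wrappers; `MockVad`, the test name, and the 0.7f probability are illustrative assumptions, not code from this CL.

#include <array>
#include <memory>
#include <utility>

#include "modules/audio_processing/agc2/vad_with_level.h"
#include "modules/audio_processing/include/audio_frame_view.h"
#include "test/gmock.h"
#include "test/gtest.h"

namespace webrtc {
namespace {

using ::testing::_;
using ::testing::Return;

// Hypothetical mock of the VAD interface introduced by this CL.
class MockVad : public VadLevelAnalyzer::VoiceActivityDetector {
 public:
  MOCK_METHOD(float,
              ComputeProbability,
              (AudioFrameView<const float> frame),
              (override));
};

TEST(VadLevelAnalyzerSketch, UsesInjectedVad) {
  auto vad = std::make_unique<MockVad>();
  // Pin the speech probability returned by the injected VAD.
  EXPECT_CALL(*vad, ComputeProbability(_)).WillOnce(Return(0.7f));
  VadLevelAnalyzer analyzer(std::move(vad));

  // 10 ms mono frame at 8 kHz (80 samples) with a constant level.
  std::array<float, 80> samples;
  samples.fill(1000.f);
  float* const channel0 = samples.data();
  AudioFrameView<float> frame_view(&channel0, /*num_channels=*/1,
                                   /*channel_size=*/samples.size());

  // The probability comes straight from the mock; rms_dbfs and peak_dbfs are
  // still computed by VadLevelAnalyzer itself.
  EXPECT_FLOAT_EQ(analyzer.AnalyzeFrame(frame_view).speech_probability, 0.7f);
}

}  // namespace
}  // namespace webrtc

Pinning the probability this way lets a test exercise the level computation and future features built on top of `VadLevelAnalyzer` without depending on the RNN VAD's output.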