From 4db667be7450e754666f0bf2d33d9925f50fb4f8 Mon Sep 17 00:00:00 2001 From: Sam Zackrisson Date: Fri, 21 Dec 2018 16:29:27 +0100 Subject: [PATCH] Add private voice detection instance to replace public voice detector This adds a second (!) VoiceDetection instance in APM, activated via webrtc::AudioProcessing::Config and which reports its values in the webrtc::AudioProcessingStats struct. The alternative is to reuse the existing instance, but that would require adding a proxy interface returned by AudioProcessing::voice_detection() to update the internal config of AudioProcessingImpl when calling voice_detection()->Enable(). Complexity-wise, no reasonable client will enable both interfaces simultaneously, so the footprint is negligible. Bug: webrtc:9947 Change-Id: I7d8e28b9bf06abab8f9c6822424bdb9d803b987d Reviewed-on: https://webrtc-review.googlesource.com/c/115243 Commit-Queue: Sam Zackrisson Reviewed-by: Ivo Creusen Cr-Commit-Position: refs/heads/master@{#26101} --- .../audio_processing/audio_processing_impl.cc | 30 ++++++++++++- .../audio_processing/audio_processing_impl.h | 2 + .../audio_processing_unittest.cc | 43 +++++++++++++++++-- .../include/audio_processing.h | 5 +++ .../include/audio_processing_statistics.h | 6 +++ 5 files changed, 82 insertions(+), 4 deletions(-) diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index 2937c0680b..c0058c73a8 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -140,6 +140,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update( bool pre_amplifier_enabled, bool echo_controller_enabled, bool voice_activity_detector_enabled, + bool private_voice_detector_enabled, bool level_estimator_enabled, bool transient_suppressor_enabled) { bool changed = false; @@ -159,6 +160,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update( changed |= (level_estimator_enabled != level_estimator_enabled_); 
changed |= (voice_activity_detector_enabled != voice_activity_detector_enabled_); + changed |= + (private_voice_detector_enabled != private_voice_detector_enabled_); changed |= (transient_suppressor_enabled != transient_suppressor_enabled_); if (changed) { high_pass_filter_enabled_ = high_pass_filter_enabled; @@ -172,6 +175,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update( echo_controller_enabled_ = echo_controller_enabled; level_estimator_enabled_ = level_estimator_enabled; voice_activity_detector_enabled_ = voice_activity_detector_enabled; + private_voice_detector_enabled_ = private_voice_detector_enabled; transient_suppressor_enabled_ = transient_suppressor_enabled; } @@ -182,7 +186,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update( bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive() const { - return CaptureMultiBandProcessingActive() || voice_activity_detector_enabled_; + return CaptureMultiBandProcessingActive() || + voice_activity_detector_enabled_ || private_voice_detector_enabled_; } bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive() @@ -260,6 +265,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules { std::unique_ptr pre_amplifier; std::unique_ptr capture_analyzer; std::unique_ptr output_level_estimator; + std::unique_ptr voice_detector; }; AudioProcessingBuilder::AudioProcessingBuilder() = default; @@ -540,6 +546,10 @@ int AudioProcessingImpl::InitializeLocked() { public_submodules_->noise_suppression->Initialize(num_proc_channels(), proc_sample_rate_hz()); public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz()); + if (private_submodules_->voice_detector) { + private_submodules_->voice_detector->Initialize( + proc_split_sample_rate_hz()); + } public_submodules_->level_estimator->Initialize(); InitializeResidualEchoDetector(); InitializeEchoController(); @@ -681,6 +691,16 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) { new 
LevelEstimatorImpl(&crit_capture_)); private_submodules_->output_level_estimator->Enable(true); } + + if (config_.voice_detection.enabled && !private_submodules_->voice_detector) { + private_submodules_->voice_detector.reset( + new VoiceDetectionImpl(&crit_capture_)); + private_submodules_->voice_detector->Enable(true); + private_submodules_->voice_detector->set_likelihood( + VoiceDetection::kVeryLowLikelihood); + private_submodules_->voice_detector->Initialize( + proc_split_sample_rate_hz()); + } } void AudioProcessingImpl::SetExtraOptions(const webrtc::Config& config) { @@ -1285,6 +1305,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { } public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer); + if (config_.voice_detection.enabled) { + private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer); + capture_.stats.voice_detected = + private_submodules_->voice_detector->stream_has_voice(); + } else { + capture_.stats.voice_detected = absl::nullopt; + } if (constants_.use_experimental_agc && public_submodules_->gain_control->is_enabled() && @@ -1695,6 +1722,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() { config_.gain_controller2.enabled, config_.pre_amplifier.enabled, capture_nonlocked_.echo_controller_enabled, public_submodules_->voice_detection->is_enabled(), + config_.voice_detection.enabled, public_submodules_->level_estimator->is_enabled(), capture_.transient_suppressor_enabled); } diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h index 2f946c5e13..815cc9549d 100644 --- a/modules/audio_processing/audio_processing_impl.h +++ b/modules/audio_processing/audio_processing_impl.h @@ -181,6 +181,7 @@ class AudioProcessingImpl : public AudioProcessing { bool pre_amplifier_enabled, bool echo_controller_enabled, bool voice_activity_detector_enabled, + bool private_voice_detector_enabled, bool level_estimator_enabled, bool transient_suppressor_enabled); 
bool CaptureMultiBandSubModulesActive() const; @@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing { bool echo_controller_enabled_ = false; bool level_estimator_enabled_ = false; bool voice_activity_detector_enabled_ = false; + bool private_voice_detector_enabled_ = false; bool transient_suppressor_enabled_ = false; bool first_update_ = true; }; diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc index d01333a84b..5bd2faef83 100644 --- a/modules/audio_processing/audio_processing_unittest.cc +++ b/modules/audio_processing/audio_processing_unittest.cc @@ -2696,7 +2696,7 @@ TEST(MAYBE_ApmStatistics, AEC2EnabledTest) { // Set up an audioframe. AudioFrame frame; frame.num_channels_ = 1; - SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); + SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz); // Fill the audio frame with a sawtooth pattern. int16_t* ptr = frame.mutable_data(); @@ -2755,7 +2755,7 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) { // Set up an audioframe. AudioFrame frame; frame.num_channels_ = 1; - SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); + SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz); // Fill the audio frame with a sawtooth pattern. int16_t* ptr = frame.mutable_data(); @@ -2809,7 +2809,7 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) { // Set up an audioframe. AudioFrame frame; frame.num_channels_ = 1; - SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); + SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz); // Fill the audio frame with a sawtooth pattern. 
int16_t* ptr = frame.mutable_data(); @@ -2838,4 +2838,41 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) { EXPECT_EQ(apm->ProcessStream(&frame), 0); EXPECT_FALSE(apm->GetStatistics(false).output_rms_dbfs); } + +TEST(ApmStatistics, ReportHasVoice) { + ProcessingConfig processing_config = { + {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}}; + AudioProcessing::Config config; + + // Set up an audioframe. + AudioFrame frame; + frame.num_channels_ = 1; + SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz); + + // Fill the audio frame with a sawtooth pattern. + int16_t* ptr = frame.mutable_data(); + for (size_t i = 0; i < frame.kMaxDataSizeSamples; i++) { + ptr[i] = 10000 * ((i % 3) - 1); + } + + std::unique_ptr<AudioProcessing> apm(AudioProcessingBuilder().Create()); + apm->Initialize(processing_config); + + // If not enabled, no metric should be reported. + EXPECT_EQ(apm->ProcessStream(&frame), 0); + EXPECT_FALSE(apm->GetStatistics(false).voice_detected); + + // If enabled, metrics should be reported. + config.voice_detection.enabled = true; + apm->ApplyConfig(config); + EXPECT_EQ(apm->ProcessStream(&frame), 0); + auto stats = apm->GetStatistics(false); + EXPECT_TRUE(stats.voice_detected); + + // If re-disabled, the value is again not reported. + config.voice_detection.enabled = false; + apm->ApplyConfig(config); + EXPECT_EQ(apm->ProcessStream(&frame), 0); + EXPECT_FALSE(apm->GetStatistics(false).voice_detected); +} } // namespace webrtc diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index df51313229..429816baca 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -288,6 +288,11 @@ class AudioProcessing : public rtc::RefCountInterface { bool enabled = false; } level_estimation; + // Enables reporting of |voice_detected| in webrtc::AudioProcessingStats. 
+ struct VoiceDetection { + bool enabled = false; + } voice_detection; + // Explicit copy assignment implementation to avoid issues with memory // sanitizer complaints in case of self-assignment. // TODO(peah): Add buildflag to ensure that this is only included for memory diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h index 683db052e6..87babee241 100644 --- a/modules/audio_processing/include/audio_processing_statistics.h +++ b/modules/audio_processing/include/audio_processing_statistics.h @@ -32,6 +32,12 @@ struct RTC_EXPORT AudioProcessingStats { // Only reported if level estimation is enabled in AudioProcessing::Config. absl::optional output_rms_dbfs; + // True if voice is detected in the last capture frame, after processing. + // It is conservative in flagging audio as speech, with low likelihood of + // incorrectly flagging a frame as voice. + // Only reported if voice detection is enabled in AudioProcessing::Config. + absl::optional<bool> voice_detected; + // AEC Statistics. // ERL = 10log_10(P_far / P_echo) absl::optional echo_return_loss;