Add private voice detection instance to replace public voice detector

This adds a second (!) VoiceDetection instance in APM, which is activated via webrtc::AudioProcessing::Config and reports its values in the webrtc::AudioProcessingStats struct.

The alternative is to reuse the existing instance, but that would require adding a proxy interface returned by AudioProcessing::voice_detection() to update the internal config of AudioProcessingImpl when calling voice_detection()->Enable().

Complexity-wise, no reasonable client will enable both interfaces simultaneously, so the footprint is negligible.

Bug: webrtc:9947
Change-Id: I7d8e28b9bf06abab8f9c6822424bdb9d803b987d
Reviewed-on: https://webrtc-review.googlesource.com/c/115243
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#26101}
This commit is contained in:
Sam Zackrisson
2018-12-21 16:29:27 +01:00
committed by Commit Bot
parent d0fce0b1ec
commit 4db667be74
5 changed files with 82 additions and 4 deletions

View File

@ -140,6 +140,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
bool pre_amplifier_enabled, bool pre_amplifier_enabled,
bool echo_controller_enabled, bool echo_controller_enabled,
bool voice_activity_detector_enabled, bool voice_activity_detector_enabled,
bool private_voice_detector_enabled,
bool level_estimator_enabled, bool level_estimator_enabled,
bool transient_suppressor_enabled) { bool transient_suppressor_enabled) {
bool changed = false; bool changed = false;
@ -159,6 +160,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
changed |= (level_estimator_enabled != level_estimator_enabled_); changed |= (level_estimator_enabled != level_estimator_enabled_);
changed |= changed |=
(voice_activity_detector_enabled != voice_activity_detector_enabled_); (voice_activity_detector_enabled != voice_activity_detector_enabled_);
changed |=
(private_voice_detector_enabled != private_voice_detector_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_); changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
if (changed) { if (changed) {
high_pass_filter_enabled_ = high_pass_filter_enabled; high_pass_filter_enabled_ = high_pass_filter_enabled;
@ -172,6 +175,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
echo_controller_enabled_ = echo_controller_enabled; echo_controller_enabled_ = echo_controller_enabled;
level_estimator_enabled_ = level_estimator_enabled; level_estimator_enabled_ = level_estimator_enabled;
voice_activity_detector_enabled_ = voice_activity_detector_enabled; voice_activity_detector_enabled_ = voice_activity_detector_enabled;
private_voice_detector_enabled_ = private_voice_detector_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled; transient_suppressor_enabled_ = transient_suppressor_enabled;
} }
@ -182,7 +186,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive() bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive()
const { const {
return CaptureMultiBandProcessingActive() || voice_activity_detector_enabled_; return CaptureMultiBandProcessingActive() ||
voice_activity_detector_enabled_ || private_voice_detector_enabled_;
} }
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive() bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive()
@ -260,6 +265,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules {
std::unique_ptr<GainApplier> pre_amplifier; std::unique_ptr<GainApplier> pre_amplifier;
std::unique_ptr<CustomAudioAnalyzer> capture_analyzer; std::unique_ptr<CustomAudioAnalyzer> capture_analyzer;
std::unique_ptr<LevelEstimatorImpl> output_level_estimator; std::unique_ptr<LevelEstimatorImpl> output_level_estimator;
std::unique_ptr<VoiceDetectionImpl> voice_detector;
}; };
AudioProcessingBuilder::AudioProcessingBuilder() = default; AudioProcessingBuilder::AudioProcessingBuilder() = default;
@ -540,6 +546,10 @@ int AudioProcessingImpl::InitializeLocked() {
public_submodules_->noise_suppression->Initialize(num_proc_channels(), public_submodules_->noise_suppression->Initialize(num_proc_channels(),
proc_sample_rate_hz()); proc_sample_rate_hz());
public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz()); public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz());
if (private_submodules_->voice_detector) {
private_submodules_->voice_detector->Initialize(
proc_split_sample_rate_hz());
}
public_submodules_->level_estimator->Initialize(); public_submodules_->level_estimator->Initialize();
InitializeResidualEchoDetector(); InitializeResidualEchoDetector();
InitializeEchoController(); InitializeEchoController();
@ -681,6 +691,16 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
new LevelEstimatorImpl(&crit_capture_)); new LevelEstimatorImpl(&crit_capture_));
private_submodules_->output_level_estimator->Enable(true); private_submodules_->output_level_estimator->Enable(true);
} }
if (config_.voice_detection.enabled && !private_submodules_->voice_detector) {
private_submodules_->voice_detector.reset(
new VoiceDetectionImpl(&crit_capture_));
private_submodules_->voice_detector->Enable(true);
private_submodules_->voice_detector->set_likelihood(
VoiceDetection::kVeryLowLikelihood);
private_submodules_->voice_detector->Initialize(
proc_split_sample_rate_hz());
}
} }
void AudioProcessingImpl::SetExtraOptions(const webrtc::Config& config) { void AudioProcessingImpl::SetExtraOptions(const webrtc::Config& config) {
@ -1285,6 +1305,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
} }
public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer); public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer);
if (config_.voice_detection.enabled) {
private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
capture_.stats.voice_detected =
private_submodules_->voice_detector->stream_has_voice();
} else {
capture_.stats.voice_detected = absl::nullopt;
}
if (constants_.use_experimental_agc && if (constants_.use_experimental_agc &&
public_submodules_->gain_control->is_enabled() && public_submodules_->gain_control->is_enabled() &&
@ -1695,6 +1722,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
config_.gain_controller2.enabled, config_.pre_amplifier.enabled, config_.gain_controller2.enabled, config_.pre_amplifier.enabled,
capture_nonlocked_.echo_controller_enabled, capture_nonlocked_.echo_controller_enabled,
public_submodules_->voice_detection->is_enabled(), public_submodules_->voice_detection->is_enabled(),
config_.voice_detection.enabled,
public_submodules_->level_estimator->is_enabled(), public_submodules_->level_estimator->is_enabled(),
capture_.transient_suppressor_enabled); capture_.transient_suppressor_enabled);
} }

View File

@ -181,6 +181,7 @@ class AudioProcessingImpl : public AudioProcessing {
bool pre_amplifier_enabled, bool pre_amplifier_enabled,
bool echo_controller_enabled, bool echo_controller_enabled,
bool voice_activity_detector_enabled, bool voice_activity_detector_enabled,
bool private_voice_detector_enabled,
bool level_estimator_enabled, bool level_estimator_enabled,
bool transient_suppressor_enabled); bool transient_suppressor_enabled);
bool CaptureMultiBandSubModulesActive() const; bool CaptureMultiBandSubModulesActive() const;
@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing {
bool echo_controller_enabled_ = false; bool echo_controller_enabled_ = false;
bool level_estimator_enabled_ = false; bool level_estimator_enabled_ = false;
bool voice_activity_detector_enabled_ = false; bool voice_activity_detector_enabled_ = false;
bool private_voice_detector_enabled_ = false;
bool transient_suppressor_enabled_ = false; bool transient_suppressor_enabled_ = false;
bool first_update_ = true; bool first_update_ = true;
}; };

View File

@ -2696,7 +2696,7 @@ TEST(MAYBE_ApmStatistics, AEC2EnabledTest) {
// Set up an audioframe. // Set up an audioframe.
AudioFrame frame; AudioFrame frame;
frame.num_channels_ = 1; frame.num_channels_ = 1;
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern. // Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data(); int16_t* ptr = frame.mutable_data();
@ -2755,7 +2755,7 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) {
// Set up an audioframe. // Set up an audioframe.
AudioFrame frame; AudioFrame frame;
frame.num_channels_ = 1; frame.num_channels_ = 1;
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern. // Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data(); int16_t* ptr = frame.mutable_data();
@ -2809,7 +2809,7 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) {
// Set up an audioframe. // Set up an audioframe.
AudioFrame frame; AudioFrame frame;
frame.num_channels_ = 1; frame.num_channels_ = 1;
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz); SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern. // Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data(); int16_t* ptr = frame.mutable_data();
@ -2838,4 +2838,41 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) {
EXPECT_EQ(apm->ProcessStream(&frame), 0); EXPECT_EQ(apm->ProcessStream(&frame), 0);
EXPECT_FALSE(apm->GetStatistics(false).output_rms_dbfs); EXPECT_FALSE(apm->GetStatistics(false).output_rms_dbfs);
} }
// Verifies that AudioProcessingStats::voice_detected is only reported while
// voice detection is enabled through AudioProcessing::Config.
TEST(ApmStatistics, ReportHasVoice) {
  ProcessingConfig processing_config = {
      {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
  AudioProcessing::Config config;

  // Set up an audioframe.
  AudioFrame frame;
  frame.num_channels_ = 1;
  SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);

  // Fill the audio frame with a sawtooth pattern cycling through
  // -10000, 0, 10000. The phase is computed in signed arithmetic: with a
  // size_t index, `(i % 3) - 1` would wrap around to SIZE_MAX and only yield
  // -10000 via an implicit narrowing conversion to int16_t.
  int16_t* ptr = frame.mutable_data();
  for (size_t i = 0; i < frame.kMaxDataSizeSamples; i++) {
    const int phase = static_cast<int>(i % 3) - 1;
    ptr[i] = static_cast<int16_t>(10000 * phase);
  }

  std::unique_ptr<AudioProcessing> apm(AudioProcessingBuilder().Create());
  apm->Initialize(processing_config);

  // If not enabled, no metric should be reported.
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  EXPECT_FALSE(apm->GetStatistics(false).voice_detected);

  // If enabled, metrics should be reported.
  // Note: this checks that the optional holds a value, not which value it
  // holds.
  config.voice_detection.enabled = true;
  apm->ApplyConfig(config);
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  auto stats = apm->GetStatistics(false);
  EXPECT_TRUE(stats.voice_detected);

  // If re-disabled, the value is again not reported.
  config.voice_detection.enabled = false;
  apm->ApplyConfig(config);
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  EXPECT_FALSE(apm->GetStatistics(false).voice_detected);
}
} // namespace webrtc } // namespace webrtc

View File

@ -288,6 +288,11 @@ class AudioProcessing : public rtc::RefCountInterface {
bool enabled = false; bool enabled = false;
} level_estimation; } level_estimation;
// Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
// Disabled by default.
struct VoiceDetection {
bool enabled = false;
} voice_detection;
// Explicit copy assignment implementation to avoid issues with memory // Explicit copy assignment implementation to avoid issues with memory
// sanitizer complaints in case of self-assignment. // sanitizer complaints in case of self-assignment.
// TODO(peah): Add buildflag to ensure that this is only included for memory // TODO(peah): Add buildflag to ensure that this is only included for memory

View File

@ -32,6 +32,12 @@ struct RTC_EXPORT AudioProcessingStats {
// Only reported if level estimation is enabled in AudioProcessing::Config. // Only reported if level estimation is enabled in AudioProcessing::Config.
absl::optional<int> output_rms_dbfs; absl::optional<int> output_rms_dbfs;
// True if voice is detected in the last capture frame, after processing.
// It is conservative in flagging audio as speech, with low likelihood of
// incorrectly flagging a frame as voice.
// Only reported if voice detection is enabled in AudioProcessing::Config.
absl::optional<bool> voice_detected;
// AEC Statistics. // AEC Statistics.
// ERL = 10log_10(P_far / P_echo) // ERL = 10log_10(P_far / P_echo)
absl::optional<double> echo_return_loss; absl::optional<double> echo_return_loss;