AudioProcessingImpl: Add a VAD submodule

Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl
and enable injecting speech probability into GainController2.

Bug: webrtc:13663
Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166
Commit-Queue: Hanna Silen <silen@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37275}
This commit is contained in:
Hanna Silen
2022-06-16 16:35:45 +02:00
committed by WebRTC LUCI CQ
parent ff45105b42
commit 0c1ad2992b
7 changed files with 348 additions and 17 deletions

View File

@ -162,6 +162,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
bool noise_suppressor_enabled,
bool adaptive_gain_controller_enabled,
bool gain_controller2_enabled,
bool voice_activity_detector_enabled,
bool gain_adjustment_enabled,
bool echo_controller_enabled,
bool transient_suppressor_enabled) {
@ -173,6 +174,8 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
changed |=
(adaptive_gain_controller_enabled != adaptive_gain_controller_enabled_);
changed |= (gain_controller2_enabled != gain_controller2_enabled_);
changed |=
(voice_activity_detector_enabled != voice_activity_detector_enabled_);
changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
changed |= (echo_controller_enabled != echo_controller_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
@ -182,6 +185,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
noise_suppressor_enabled_ = noise_suppressor_enabled;
adaptive_gain_controller_enabled_ = adaptive_gain_controller_enabled;
gain_controller2_enabled_ = gain_controller2_enabled;
voice_activity_detector_enabled_ = voice_activity_detector_enabled;
gain_adjustment_enabled_ = gain_adjustment_enabled;
echo_controller_enabled_ = echo_controller_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled;
@ -395,6 +399,7 @@ void AudioProcessingImpl::InitializeLocked() {
InitializeResidualEchoDetector();
InitializeEchoController();
InitializeGainController2(/*config_has_changed=*/true);
InitializeVoiceActivityDetector(/*config_has_changed=*/true);
InitializeNoiseSuppressor();
InitializeAnalyzer();
InitializePostProcessor();
@ -569,6 +574,7 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
}
InitializeGainController2(agc2_config_changed);
InitializeVoiceActivityDetector(agc2_config_changed);
if (pre_amplifier_config_changed || gain_adjustment_config_changed) {
InitializeCaptureLevelsAdjuster();
@ -1297,10 +1303,19 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
submodules_.capture_analyzer->Analyze(capture_buffer);
}
absl::optional<float> voice_activity_probability = absl::nullopt;
if (submodules_.gain_controller2) {
submodules_.gain_controller2->NotifyAnalogLevel(
recommended_stream_analog_level_locked());
submodules_.gain_controller2->Process(capture_buffer);
if (submodules_.voice_activity_detector) {
voice_activity_probability =
submodules_.voice_activity_detector->Analyze(
AudioFrameView<const float>(capture_buffer->channels(),
capture_buffer->num_channels(),
capture_buffer->num_frames()));
}
submodules_.gain_controller2->Process(voice_activity_probability,
capture_buffer);
}
if (submodules_.capture_post_processor) {
@ -1692,7 +1707,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
return submodule_states_.Update(
config_.high_pass_filter.enabled, !!submodules_.echo_control_mobile,
!!submodules_.noise_suppressor, !!submodules_.gain_control,
!!submodules_.gain_controller2,
!!submodules_.gain_controller2, !!submodules_.voice_activity_detector,
config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
capture_nonlocked_.echo_controller_enabled,
!!submodules_.transient_suppressor);
@ -1900,9 +1915,35 @@ void AudioProcessingImpl::InitializeGainController2(bool config_has_changed) {
return;
}
if (!submodules_.gain_controller2 || config_has_changed) {
const bool use_internal_vad =
transient_suppressor_vad_mode_ != TransientSuppressor::VadMode::kRnnVad;
submodules_.gain_controller2 = std::make_unique<GainController2>(
config_.gain_controller2, proc_fullband_sample_rate_hz(),
num_input_channels());
num_input_channels(), use_internal_vad);
}
}
// Brings the voice activity detector (VAD) submodule in line with the current
// configuration.
//
// Nothing happens unless `config_has_changed` is true. When it is, the
// detector is either destroyed (when it is not required) or rebuilt from
// scratch (when it is required), regardless of whether one already existed.
void AudioProcessingImpl::InitializeVoiceActivityDetector(
    bool config_has_changed) {
  if (config_has_changed) {
    // The detector is required only when the transient suppressor VAD mode is
    // `kRnnVad` and AGC2 is enabled together with its adaptive digital
    // controller.
    const bool rnn_vad_mode_selected =
        transient_suppressor_vad_mode_ == TransientSuppressor::VadMode::kRnnVad;
    const bool vad_required =
        rnn_vad_mode_selected && config_.gain_controller2.enabled &&
        config_.gain_controller2.adaptive_digital.enabled;

    if (vad_required) {
      // The CPU features are read from the gain controller below, so it must
      // already exist at this point.
      RTC_DCHECK(!!submodules_.gain_controller2);
      // TODO(bugs.webrtc.org/13663): Cache CPU features in APM and use here.
      submodules_.voice_activity_detector =
          std::make_unique<VoiceActivityDetectorWrapper>(
              config_.gain_controller2.adaptive_digital.vad_reset_period_ms,
              submodules_.gain_controller2->GetCpuFeatures(),
              proc_fullband_sample_rate_hz());
    } else {
      submodules_.voice_activity_detector.reset();
    }
  }
}