Add private voice detection instance to replace public voice detector
This adds a second (!) VoiceDetection instance in APM, which is activated via webrtc::AudioProcessing::Config and reports its values in the webrtc::AudioProcessingStats struct. The alternative is to reuse the existing instance, but that would require adding a proxy interface returned by AudioProcessing::voice_detection() to update the internal config of AudioProcessingImpl when calling voice_detection()->Enable(). Complexity-wise, no reasonable client will enable both interfaces simultaneously, so the footprint is negligible. Bug: webrtc:9947 Change-Id: I7d8e28b9bf06abab8f9c6822424bdb9d803b987d Reviewed-on: https://webrtc-review.googlesource.com/c/115243 Commit-Queue: Sam Zackrisson <saza@webrtc.org> Reviewed-by: Ivo Creusen <ivoc@webrtc.org> Cr-Commit-Position: refs/heads/master@{#26101}
This commit is contained in:

committed by
Commit Bot

parent
d0fce0b1ec
commit
4db667be74
@ -140,6 +140,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
|
||||
bool pre_amplifier_enabled,
|
||||
bool echo_controller_enabled,
|
||||
bool voice_activity_detector_enabled,
|
||||
bool private_voice_detector_enabled,
|
||||
bool level_estimator_enabled,
|
||||
bool transient_suppressor_enabled) {
|
||||
bool changed = false;
|
||||
@ -159,6 +160,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
|
||||
changed |= (level_estimator_enabled != level_estimator_enabled_);
|
||||
changed |=
|
||||
(voice_activity_detector_enabled != voice_activity_detector_enabled_);
|
||||
changed |=
|
||||
(private_voice_detector_enabled != private_voice_detector_enabled_);
|
||||
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
|
||||
if (changed) {
|
||||
high_pass_filter_enabled_ = high_pass_filter_enabled;
|
||||
@ -172,6 +175,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
|
||||
echo_controller_enabled_ = echo_controller_enabled;
|
||||
level_estimator_enabled_ = level_estimator_enabled;
|
||||
voice_activity_detector_enabled_ = voice_activity_detector_enabled;
|
||||
private_voice_detector_enabled_ = private_voice_detector_enabled;
|
||||
transient_suppressor_enabled_ = transient_suppressor_enabled;
|
||||
}
|
||||
|
||||
@ -182,7 +186,8 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
|
||||
|
||||
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive()
|
||||
const {
|
||||
return CaptureMultiBandProcessingActive() || voice_activity_detector_enabled_;
|
||||
return CaptureMultiBandProcessingActive() ||
|
||||
voice_activity_detector_enabled_ || private_voice_detector_enabled_;
|
||||
}
|
||||
|
||||
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive()
|
||||
@ -260,6 +265,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules {
|
||||
std::unique_ptr<GainApplier> pre_amplifier;
|
||||
std::unique_ptr<CustomAudioAnalyzer> capture_analyzer;
|
||||
std::unique_ptr<LevelEstimatorImpl> output_level_estimator;
|
||||
std::unique_ptr<VoiceDetectionImpl> voice_detector;
|
||||
};
|
||||
|
||||
AudioProcessingBuilder::AudioProcessingBuilder() = default;
|
||||
@ -540,6 +546,10 @@ int AudioProcessingImpl::InitializeLocked() {
|
||||
public_submodules_->noise_suppression->Initialize(num_proc_channels(),
|
||||
proc_sample_rate_hz());
|
||||
public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz());
|
||||
if (private_submodules_->voice_detector) {
|
||||
private_submodules_->voice_detector->Initialize(
|
||||
proc_split_sample_rate_hz());
|
||||
}
|
||||
public_submodules_->level_estimator->Initialize();
|
||||
InitializeResidualEchoDetector();
|
||||
InitializeEchoController();
|
||||
@ -681,6 +691,16 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
|
||||
new LevelEstimatorImpl(&crit_capture_));
|
||||
private_submodules_->output_level_estimator->Enable(true);
|
||||
}
|
||||
|
||||
if (config_.voice_detection.enabled && !private_submodules_->voice_detector) {
|
||||
private_submodules_->voice_detector.reset(
|
||||
new VoiceDetectionImpl(&crit_capture_));
|
||||
private_submodules_->voice_detector->Enable(true);
|
||||
private_submodules_->voice_detector->set_likelihood(
|
||||
VoiceDetection::kVeryLowLikelihood);
|
||||
private_submodules_->voice_detector->Initialize(
|
||||
proc_split_sample_rate_hz());
|
||||
}
|
||||
}
|
||||
|
||||
void AudioProcessingImpl::SetExtraOptions(const webrtc::Config& config) {
|
||||
@ -1285,6 +1305,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
|
||||
}
|
||||
|
||||
public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer);
|
||||
if (config_.voice_detection.enabled) {
|
||||
private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
|
||||
capture_.stats.voice_detected =
|
||||
private_submodules_->voice_detector->stream_has_voice();
|
||||
} else {
|
||||
capture_.stats.voice_detected = absl::nullopt;
|
||||
}
|
||||
|
||||
if (constants_.use_experimental_agc &&
|
||||
public_submodules_->gain_control->is_enabled() &&
|
||||
@ -1695,6 +1722,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
|
||||
config_.gain_controller2.enabled, config_.pre_amplifier.enabled,
|
||||
capture_nonlocked_.echo_controller_enabled,
|
||||
public_submodules_->voice_detection->is_enabled(),
|
||||
config_.voice_detection.enabled,
|
||||
public_submodules_->level_estimator->is_enabled(),
|
||||
capture_.transient_suppressor_enabled);
|
||||
}
|
||||
|
@ -181,6 +181,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
||||
bool pre_amplifier_enabled,
|
||||
bool echo_controller_enabled,
|
||||
bool voice_activity_detector_enabled,
|
||||
bool private_voice_detector_enabled,
|
||||
bool level_estimator_enabled,
|
||||
bool transient_suppressor_enabled);
|
||||
bool CaptureMultiBandSubModulesActive() const;
|
||||
@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
||||
bool echo_controller_enabled_ = false;
|
||||
bool level_estimator_enabled_ = false;
|
||||
bool voice_activity_detector_enabled_ = false;
|
||||
bool private_voice_detector_enabled_ = false;
|
||||
bool transient_suppressor_enabled_ = false;
|
||||
bool first_update_ = true;
|
||||
};
|
||||
|
@ -2696,7 +2696,7 @@ TEST(MAYBE_ApmStatistics, AEC2EnabledTest) {
|
||||
// Set up an audioframe.
|
||||
AudioFrame frame;
|
||||
frame.num_channels_ = 1;
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
|
||||
|
||||
// Fill the audio frame with a sawtooth pattern.
|
||||
int16_t* ptr = frame.mutable_data();
|
||||
@ -2755,7 +2755,7 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) {
|
||||
// Set up an audioframe.
|
||||
AudioFrame frame;
|
||||
frame.num_channels_ = 1;
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
|
||||
|
||||
// Fill the audio frame with a sawtooth pattern.
|
||||
int16_t* ptr = frame.mutable_data();
|
||||
@ -2809,7 +2809,7 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) {
|
||||
// Set up an audioframe.
|
||||
AudioFrame frame;
|
||||
frame.num_channels_ = 1;
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
|
||||
SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
|
||||
|
||||
// Fill the audio frame with a sawtooth pattern.
|
||||
int16_t* ptr = frame.mutable_data();
|
||||
@ -2838,4 +2838,41 @@ TEST(ApmStatistics, ReportOutputRmsDbfs) {
|
||||
EXPECT_EQ(apm->ProcessStream(&frame), 0);
|
||||
EXPECT_FALSE(apm->GetStatistics(false).output_rms_dbfs);
|
||||
}
|
||||
|
||||
// Checks that AudioProcessingStats::voice_detected is reported only while
// voice detection is enabled through AudioProcessing::Config: absent before
// enabling, present after enabling, and absent again after disabling.
TEST(ApmStatistics, ReportHasVoice) {
  ProcessingConfig processing_config = {
      {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
  AudioProcessing::Config config;

  // Set up an audioframe.
  AudioFrame frame;
  frame.num_channels_ = 1;
  SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);

  // Fill the audio frame with a sawtooth pattern (-10000, 0, 10000, ...).
  // The phase is computed in signed arithmetic: subtracting 1 from the
  // unsigned |i % 3| would wrap around and only produce the intended sample
  // through an implicit narrowing conversion.
  int16_t* ptr = frame.mutable_data();
  for (size_t i = 0; i < frame.kMaxDataSizeSamples; i++) {
    const int phase = static_cast<int>(i % 3) - 1;
    ptr[i] = static_cast<int16_t>(10000 * phase);
  }

  std::unique_ptr<AudioProcessing> apm(AudioProcessingBuilder().Create());
  apm->Initialize(processing_config);

  // If not enabled, no metric should be reported.
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  EXPECT_FALSE(apm->GetStatistics(false).voice_detected);

  // If enabled, metrics should be reported. Only the presence of the optional
  // is checked here, not whether voice was actually detected.
  config.voice_detection.enabled = true;
  apm->ApplyConfig(config);
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  auto stats = apm->GetStatistics(false);
  EXPECT_TRUE(stats.voice_detected);

  // If re-disabled, the value is again not reported.
  config.voice_detection.enabled = false;
  apm->ApplyConfig(config);
  EXPECT_EQ(apm->ProcessStream(&frame), 0);
  EXPECT_FALSE(apm->GetStatistics(false).voice_detected);
}
|
||||
} // namespace webrtc
|
||||
|
@ -288,6 +288,11 @@ class AudioProcessing : public rtc::RefCountInterface {
|
||||
bool enabled = false;
|
||||
} level_estimation;
|
||||
|
||||
// Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
|
||||
struct VoiceDetection {
|
||||
bool enabled = false;
|
||||
} voice_detection;
|
||||
|
||||
// Explicit copy assignment implementation to avoid issues with memory
|
||||
// sanitizer complaints in case of self-assignment.
|
||||
// TODO(peah): Add buildflag to ensure that this is only included for memory
|
||||
|
@ -32,6 +32,12 @@ struct RTC_EXPORT AudioProcessingStats {
|
||||
// Only reported if level estimation is enabled in AudioProcessing::Config.
|
||||
absl::optional<int> output_rms_dbfs;
|
||||
|
||||
// True if voice is detected in the last capture frame, after processing.
|
||||
// It is conservative in flagging audio as speech, with low likelihood of
|
||||
// incorrectly flagging a frame as voice.
|
||||
// Only reported if voice detection is enabled in AudioProcessing::Config.
|
||||
absl::optional<bool> voice_detected;
|
||||
|
||||
// AEC Statistics.
|
||||
// ERL = 10log_10(P_far / P_echo)
|
||||
absl::optional<double> echo_return_loss;
|
||||
|
Reference in New Issue
Block a user