Delete voice_detection() pointer to submodule

The new configuration path is via AudioProcessing::ApplyConfig and
AudioProcessing::GetStatistics. ApmTest.Process passes with unchanged
reference files if audio_processing_impl would initialize the VAD with
VoiceDetection::kLowLikelihood instead of kVeryLowLikelihood. This was
verified by testing this CL with that modification.

Bug: webrtc:9878
Change-Id: I4d08df37a07e5c72feeec02a07d6b9435f917d72
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/155445
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29395}

Committed by: Commit Bot
Parent: 24d251f796
Commit: 0824c6f61a
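For reference, the configuration path mentioned in the commit message looks roughly like this (a minimal sketch, not part of the change; the APM setup, the silent 10 ms mono frame at 16 kHz, and the missing error handling are simplifying assumptions):

#include <memory>

#include "api/audio/audio_frame.h"
#include "modules/audio_processing/include/audio_processing.h"

void SketchConfigBasedVad() {
  // Enable the VAD through ApplyConfig() instead of the removed
  // voice_detection() submodule pointer.
  std::unique_ptr<webrtc::AudioProcessing> apm(
      webrtc::AudioProcessingBuilder().Create());
  webrtc::AudioProcessing::Config config = apm->GetConfig();
  config.voice_detection.enabled = true;
  apm->ApplyConfig(config);

  // Process one 10 ms capture frame (left silent here for brevity).
  webrtc::AudioFrame frame;
  frame.sample_rate_hz_ = 16000;
  frame.num_channels_ = 1;
  frame.samples_per_channel_ = 160;
  apm->ProcessStream(&frame);

  // The decision is read back from the statistics instead of
  // voice_detection()->stream_has_voice().
  webrtc::AudioProcessingStats stats =
      apm->GetStatistics(/*has_remote_tracks=*/false);
  bool voice = stats.voice_detected && *stats.voice_detected;
  (void)voice;
}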
@@ -156,8 +156,8 @@ rtc_static_library("audio_processing") {
     "transient/wpd_tree.h",
     "typing_detection.cc",
     "typing_detection.h",
-    "voice_detection_impl.cc",
-    "voice_detection_impl.h",
+    "voice_detection.cc",
+    "voice_detection.h",
   ]

   defines = []
@@ -40,7 +40,7 @@
 #include "modules/audio_processing/noise_suppression_proxy.h"
 #include "modules/audio_processing/residual_echo_detector.h"
 #include "modules/audio_processing/transient/transient_suppressor.h"
-#include "modules/audio_processing/voice_detection_impl.h"
+#include "modules/audio_processing/voice_detection.h"
 #include "rtc_base/atomic_ops.h"
 #include "rtc_base/checks.h"
 #include "rtc_base/constructor_magic.h"
@@ -165,8 +165,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
     bool gain_controller2_enabled,
     bool pre_amplifier_enabled,
     bool echo_controller_enabled,
-    bool voice_activity_detector_enabled,
-    bool private_voice_detector_enabled,
+    bool voice_detector_enabled,
     bool level_estimator_enabled,
     bool transient_suppressor_enabled) {
   bool changed = false;
@@ -183,10 +182,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
   changed |= (pre_amplifier_enabled_ != pre_amplifier_enabled);
   changed |= (echo_controller_enabled != echo_controller_enabled_);
   changed |= (level_estimator_enabled != level_estimator_enabled_);
-  changed |=
-      (voice_activity_detector_enabled != voice_activity_detector_enabled_);
-  changed |=
-      (private_voice_detector_enabled != private_voice_detector_enabled_);
+  changed |= (voice_detector_enabled != voice_detector_enabled_);
   changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
   if (changed) {
     high_pass_filter_enabled_ = high_pass_filter_enabled;
@@ -199,8 +195,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
     pre_amplifier_enabled_ = pre_amplifier_enabled;
     echo_controller_enabled_ = echo_controller_enabled;
     level_estimator_enabled_ = level_estimator_enabled;
-    voice_activity_detector_enabled_ = voice_activity_detector_enabled;
-    private_voice_detector_enabled_ = private_voice_detector_enabled;
+    voice_detector_enabled_ = voice_detector_enabled;
     transient_suppressor_enabled_ = transient_suppressor_enabled;
   }

@@ -211,8 +206,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(

 bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive()
     const {
-  return CaptureMultiBandProcessingActive() ||
-         voice_activity_detector_enabled_ || private_voice_detector_enabled_;
+  return CaptureMultiBandProcessingActive() || voice_detector_enabled_;
 }

 bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive()
@@ -263,7 +257,6 @@ struct AudioProcessingImpl::ApmPublicSubmodules {
   std::unique_ptr<LevelEstimatorImpl> level_estimator;
   std::unique_ptr<NoiseSuppressionImpl> noise_suppression;
   std::unique_ptr<NoiseSuppressionProxy> noise_suppression_proxy;
-  std::unique_ptr<VoiceDetectionImpl> voice_detection;
   std::unique_ptr<GainControlImpl> gain_control;
   std::unique_ptr<GainControlForExperimentalAgc>
       gain_control_for_experimental_agc;
@@ -295,7 +288,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules {
   std::unique_ptr<GainApplier> pre_amplifier;
   std::unique_ptr<CustomAudioAnalyzer> capture_analyzer;
   std::unique_ptr<LevelEstimatorImpl> output_level_estimator;
-  std::unique_ptr<VoiceDetectionImpl> voice_detector;
+  std::unique_ptr<VoiceDetection> voice_detector;
 };

 AudioProcessingBuilder::AudioProcessingBuilder() = default;
@@ -415,8 +408,6 @@ AudioProcessingImpl::AudioProcessingImpl(
       new NoiseSuppressionImpl(&crit_capture_));
   public_submodules_->noise_suppression_proxy.reset(new NoiseSuppressionProxy(
       this, public_submodules_->noise_suppression.get()));
-  public_submodules_->voice_detection.reset(
-      new VoiceDetectionImpl(&crit_capture_));
   public_submodules_->gain_control_for_experimental_agc.reset(
       new GainControlForExperimentalAgc(
           public_submodules_->gain_control.get()));
@@ -556,11 +547,7 @@ int AudioProcessingImpl::InitializeLocked() {
   InitializeHighPassFilter();
   public_submodules_->noise_suppression->Initialize(num_proc_channels(),
                                                     proc_sample_rate_hz());
-  public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz());
-  if (private_submodules_->voice_detector) {
-    private_submodules_->voice_detector->Initialize(
-        proc_split_sample_rate_hz());
-  }
+  InitializeVoiceDetector();
   public_submodules_->level_estimator->Initialize();
   InitializeResidualEchoDetector();
   InitializeEchoController();
@@ -702,6 +689,9 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
       config_.gain_controller1.analog_level_maximum !=
           config.gain_controller1.analog_level_maximum;

+  const bool voice_detection_config_changed =
+      config_.voice_detection.enabled != config.voice_detection.enabled;
+
   config_ = config;

   if (aec_config_changed) {
@@ -745,14 +735,8 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
     private_submodules_->output_level_estimator->Enable(true);
   }

-  if (config_.voice_detection.enabled && !private_submodules_->voice_detector) {
-    private_submodules_->voice_detector.reset(
-        new VoiceDetectionImpl(&crit_capture_));
-    private_submodules_->voice_detector->Enable(true);
-    private_submodules_->voice_detector->set_likelihood(
-        VoiceDetection::kVeryLowLikelihood);
-    private_submodules_->voice_detector->Initialize(
-        proc_split_sample_rate_hz());
+  if (voice_detection_config_changed) {
+    InitializeVoiceDetector();
   }

   // Reinitialization must happen after all submodule configuration to avoid
@@ -1276,14 +1260,17 @@ int AudioProcessingImpl::ProcessStream(AudioFrame* frame) {
     RecordUnprocessedCaptureStream(*frame);
   }

-  capture_.vad_activity = frame->vad_activity_;
   capture_.capture_audio->CopyFrom(frame);
   RETURN_ON_ERR(ProcessCaptureStreamLocked());
   if (submodule_states_.CaptureMultiBandProcessingActive() ||
       submodule_states_.CaptureFullBandProcessingActive()) {
     capture_.capture_audio->CopyTo(frame);
   }
-  frame->vad_activity_ = capture_.vad_activity;
+  if (capture_.stats.voice_detected) {
+    frame->vad_activity_ = *capture_.stats.voice_detected
+                               ? AudioFrame::kVadActive
+                               : AudioFrame::kVadPassive;
+  }

   if (aec_dump_) {
     RecordProcessedCaptureStream(*frame);
@@ -1432,19 +1419,10 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
     public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
   }

-  if (public_submodules_->voice_detection->is_enabled() &&
-      !public_submodules_->voice_detection->using_external_vad()) {
-    bool voice_active =
-        public_submodules_->voice_detection->ProcessCaptureAudio(
-            capture_buffer);
-    capture_.vad_activity =
-        voice_active ? AudioFrame::kVadActive : AudioFrame::kVadPassive;
-  }
-
   if (config_.voice_detection.enabled) {
-    private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
     capture_.stats.voice_detected =
-        private_submodules_->voice_detector->stream_has_voice();
+        private_submodules_->voice_detector->ProcessCaptureAudio(
+            capture_buffer);
   } else {
     capture_.stats.voice_detected = absl::nullopt;
   }
@@ -1817,10 +1795,6 @@ NoiseSuppression* AudioProcessingImpl::noise_suppression() const {
   return public_submodules_->noise_suppression_proxy.get();
 }

-VoiceDetection* AudioProcessingImpl::voice_detection() const {
-  return public_submodules_->voice_detection.get();
-}
-
 void AudioProcessingImpl::MutateConfig(
     rtc::FunctionView<void(AudioProcessing::Config*)> mutator) {
   rtc::CritScope cs_render(&crit_render_);
@@ -1845,7 +1819,6 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
       public_submodules_->gain_control->is_enabled(),
       config_.gain_controller2.enabled, config_.pre_amplifier.enabled,
       capture_nonlocked_.echo_controller_enabled,
-      public_submodules_->voice_detection->is_enabled(),
       config_.voice_detection.enabled,
       public_submodules_->level_estimator->is_enabled(),
       capture_.transient_suppressor_enabled);
@@ -1871,6 +1844,14 @@ void AudioProcessingImpl::InitializeHighPassFilter() {
     }
   }

+void AudioProcessingImpl::InitializeVoiceDetector() {
+  if (config_.voice_detection.enabled) {
+    private_submodules_->voice_detector = std::make_unique<VoiceDetection>(
+        proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
+  } else {
+    private_submodules_->voice_detector.reset();
+  }
+}
 void AudioProcessingImpl::InitializeEchoController() {
   bool use_echo_controller =
       echo_control_factory_ ||
@@ -122,7 +122,6 @@ class AudioProcessingImpl : public AudioProcessing {
   GainControl* gain_control() const override;
   LevelEstimator* level_estimator() const override;
   NoiseSuppression* noise_suppression() const override;
-  VoiceDetection* voice_detection() const override;

   // TODO(peah): Remove MutateConfig once the new API allows that.
   void MutateConfig(rtc::FunctionView<void(AudioProcessing::Config*)> mutator);
@@ -182,8 +181,7 @@ class AudioProcessingImpl : public AudioProcessing {
                 bool gain_controller2_enabled,
                 bool pre_amplifier_enabled,
                 bool echo_controller_enabled,
-                bool voice_activity_detector_enabled,
-                bool private_voice_detector_enabled,
+                bool voice_detector_enabled,
                 bool level_estimator_enabled,
                 bool transient_suppressor_enabled);
     bool CaptureMultiBandSubModulesActive() const;
@@ -209,8 +207,7 @@ class AudioProcessingImpl : public AudioProcessing {
     bool pre_amplifier_enabled_ = false;
     bool echo_controller_enabled_ = false;
     bool level_estimator_enabled_ = false;
-    bool voice_activity_detector_enabled_ = false;
-    bool private_voice_detector_enabled_ = false;
+    bool voice_detector_enabled_ = false;
     bool transient_suppressor_enabled_ = false;
     bool first_update_ = true;
   };
@@ -239,6 +236,7 @@ class AudioProcessingImpl : public AudioProcessing {
   void InitializeResidualEchoDetector()
       RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_render_, crit_capture_);
   void InitializeHighPassFilter() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
+  void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
   void InitializeEchoController()
       RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_render_, crit_capture_);
   void InitializeGainController2() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
@@ -405,7 +403,6 @@ class AudioProcessingImpl : public AudioProcessing {
       size_t num_keyboard_frames = 0;
       const float* keyboard_data = nullptr;
     } keyboard_info;
-    AudioFrame::VADActivity vad_activity = AudioFrame::kVadUnknown;
   } capture_ RTC_GUARDED_BY(crit_capture_);

   struct ApmCaptureNonLockedState {
@@ -595,7 +595,6 @@ void StatsProcessor::Process() {

   // The below return values are not testable.
   apm_->noise_suppression()->speech_probability();
-  apm_->voice_detection()->is_enabled();

   apm_->GetStatistics(/*has_remote_tracks=*/true);
 }
@@ -193,12 +193,11 @@ void EnableAllAPComponents(AudioProcessing* ap) {

   apm_config.high_pass_filter.enabled = true;
   apm_config.level_estimation.enabled = true;
+  apm_config.voice_detection.enabled = true;
   ap->ApplyConfig(apm_config);

   EXPECT_NOERR(ap->level_estimator()->Enable(true));
   EXPECT_NOERR(ap->noise_suppression()->Enable(true));
-
-  EXPECT_NOERR(ap->voice_detection()->Enable(true));
 }

 // These functions are only used by ApmTest.Process.
@@ -1114,63 +1113,6 @@ TEST_F(ApmTest, LevelEstimator) {
   EXPECT_EQ(90, apm_->level_estimator()->RMS());
 }

-TEST_F(ApmTest, VoiceDetection) {
-  // Test external VAD
-  EXPECT_EQ(apm_->kNoError,
-            apm_->voice_detection()->set_stream_has_voice(true));
-  EXPECT_TRUE(apm_->voice_detection()->stream_has_voice());
-  EXPECT_EQ(apm_->kNoError,
-            apm_->voice_detection()->set_stream_has_voice(false));
-  EXPECT_FALSE(apm_->voice_detection()->stream_has_voice());
-
-  // Test valid likelihoods
-  VoiceDetection::Likelihood likelihood[] = {
-      VoiceDetection::kVeryLowLikelihood, VoiceDetection::kLowLikelihood,
-      VoiceDetection::kModerateLikelihood, VoiceDetection::kHighLikelihood};
-  for (size_t i = 0; i < arraysize(likelihood); i++) {
-    EXPECT_EQ(apm_->kNoError,
-              apm_->voice_detection()->set_likelihood(likelihood[i]));
-    EXPECT_EQ(likelihood[i], apm_->voice_detection()->likelihood());
-  }
-
-  /* TODO(bjornv): Enable once VAD supports other frame lengths than 10 ms
-  // Test invalid frame sizes
-  EXPECT_EQ(apm_->kBadParameterError,
-            apm_->voice_detection()->set_frame_size_ms(12));
-
-  // Test valid frame sizes
-  for (int i = 10; i <= 30; i += 10) {
-    EXPECT_EQ(apm_->kNoError,
-              apm_->voice_detection()->set_frame_size_ms(i));
-    EXPECT_EQ(i, apm_->voice_detection()->frame_size_ms());
-  }
-  */
-
-  // Turn VAD on/off
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
-  EXPECT_TRUE(apm_->voice_detection()->is_enabled());
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
-  EXPECT_FALSE(apm_->voice_detection()->is_enabled());
-
-  // Test that AudioFrame activity is maintained when VAD is disabled.
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
-  AudioFrame::VADActivity activity[] = {
-      AudioFrame::kVadActive, AudioFrame::kVadPassive, AudioFrame::kVadUnknown};
-  for (size_t i = 0; i < arraysize(activity); i++) {
-    frame_->vad_activity_ = activity[i];
-    EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
-    EXPECT_EQ(activity[i], frame_->vad_activity_);
-  }
-
-  // Test that AudioFrame activity is set when VAD is enabled.
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
-  frame_->vad_activity_ = AudioFrame::kVadUnknown;
-  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
-  EXPECT_NE(AudioFrame::kVadUnknown, frame_->vad_activity_);
-
-  // TODO(bjornv): Add tests for streamed voice; stream_has_voice()
-}
-
 TEST_F(ApmTest, AllProcessingDisabledByDefault) {
   AudioProcessing::Config config = apm_->GetConfig();
   EXPECT_FALSE(config.echo_canceller.enabled);
@@ -1180,7 +1122,6 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) {
   EXPECT_FALSE(apm_->gain_control()->is_enabled());
   EXPECT_FALSE(apm_->level_estimator()->is_enabled());
   EXPECT_FALSE(apm_->noise_suppression()->is_enabled());
-  EXPECT_FALSE(apm_->voice_detection()->is_enabled());
 }

 TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@@ -1282,16 +1223,7 @@ TEST_F(ApmTest, SplittingFilter) {
   EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
   EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(false));

-  // 3. Only VAD is enabled...
-  SetFrameTo(frame_, 1000);
-  frame_copy.CopyFrom(*frame_);
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
-  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
-  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
-  EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
-
-  // 4. Only GetStatistics-reporting VAD is enabled...
+  // 3. Only GetStatistics-reporting VAD is enabled...
   SetFrameTo(frame_, 1000);
   frame_copy.CopyFrom(*frame_);
   auto apm_config = apm_->GetConfig();
@@ -1303,18 +1235,16 @@ TEST_F(ApmTest, SplittingFilter) {
   apm_config.voice_detection.enabled = false;
   apm_->ApplyConfig(apm_config);

-  // 5. Both VADs and the level estimator are enabled...
+  // 4. Both the VAD and the level estimator are enabled...
   SetFrameTo(frame_, 1000);
   frame_copy.CopyFrom(*frame_);
   EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(true));
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
   apm_config.voice_detection.enabled = true;
   apm_->ApplyConfig(apm_config);
   EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
   EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
   EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
   EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(false));
-  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
   apm_config.voice_detection.enabled = false;
   apm_->ApplyConfig(apm_config);

@@ -1652,18 +1582,15 @@ TEST_F(ApmTest, Process) {
     if (apm_->gain_control()->stream_is_saturated()) {
       is_saturated_count++;
     }
-    if (apm_->voice_detection()->stream_has_voice()) {
-      has_voice_count++;
-      EXPECT_EQ(AudioFrame::kVadActive, frame_->vad_activity_);
-    } else {
-      EXPECT_EQ(AudioFrame::kVadPassive, frame_->vad_activity_);
-    }
-
-    ns_speech_prob_average += apm_->noise_suppression()->speech_probability();
     AudioProcessingStats stats =
         apm_->GetStatistics(/*has_remote_tracks=*/false);
+    EXPECT_TRUE(stats.voice_detected);
+    EXPECT_TRUE(stats.output_rms_dbfs);
+    has_voice_count += *stats.voice_detected ? 1 : 0;
     rms_dbfs_average += *stats.output_rms_dbfs;

+    ns_speech_prob_average += apm_->noise_suppression()->speech_probability();
+
     size_t frame_size = frame_->samples_per_channel_ * frame_->num_channels_;
     size_t write_count =
         fwrite(frame_->data(), sizeof(int16_t), frame_size, out_file_);
@@ -2566,7 +2493,6 @@ std::unique_ptr<AudioProcessing> CreateApm(bool mobile_aec) {
   EXPECT_EQ(apm->gain_control()->Enable(false), 0);
   EXPECT_EQ(apm->level_estimator()->Enable(false), 0);
   EXPECT_EQ(apm->noise_suppression()->Enable(false), 0);
-  EXPECT_EQ(apm->voice_detection()->Enable(false), 0);
   return apm;
 }

@@ -53,7 +53,6 @@ class LevelEstimator;
 class NoiseSuppression;
 class CustomAudioAnalyzer;
 class CustomProcessing;
-class VoiceDetection;

 // Use to enable the extended filter mode in the AEC, along with robustness
 // measures around the reported system delays. It comes with a significant
@@ -287,7 +286,10 @@ class AudioProcessing : public rtc::RefCountInterface {
       Level level = kModerate;
     } noise_suppression;

-    // Enables reporting of |has_voice| in webrtc::AudioProcessingStats.
+    // Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
+    // In addition to |voice_detected|, VAD decision is provided through the
+    // |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will
+    // be modified to reflect the current decision.
     struct VoiceDetection {
       bool enabled = false;
     } voice_detection;
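As the comment added above describes, the decision is also written onto the processed AudioFrame. A rough illustration of that path (a hypothetical helper, not part of the diff; it assumes the VAD was already enabled via ApplyConfig() as in the sketch near the top of this change):

// Process one capture frame and read the VAD decision back from the frame.
bool ProcessAndReadVad(webrtc::AudioProcessing* apm, webrtc::AudioFrame* frame) {
  apm->ProcessStream(frame);
  // After ProcessStream(), |vad_activity_| reflects the current decision.
  return frame->vad_activity_ == webrtc::AudioFrame::kVadActive;
}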
@@ -685,7 +687,6 @@ class AudioProcessing : public rtc::RefCountInterface {
   virtual GainControl* gain_control() const = 0;
   virtual LevelEstimator* level_estimator() const = 0;
   virtual NoiseSuppression* noise_suppression() const = 0;
-  virtual VoiceDetection* voice_detection() const = 0;

   // Returns the last applied configuration.
   virtual AudioProcessing::Config GetConfig() const = 0;
@@ -981,56 +982,6 @@ class EchoDetector : public rtc::RefCountInterface {
   virtual Metrics GetMetrics() const = 0;
 };

-// The voice activity detection (VAD) component analyzes the stream to
-// determine if voice is present. A facility is also provided to pass in an
-// external VAD decision.
-//
-// In addition to |stream_has_voice()| the VAD decision is provided through the
-// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will be
-// modified to reflect the current decision.
-class VoiceDetection {
- public:
-  virtual int Enable(bool enable) = 0;
-  virtual bool is_enabled() const = 0;
-
-  // Returns true if voice is detected in the current frame. Should be called
-  // after |ProcessStream()|.
-  virtual bool stream_has_voice() const = 0;
-
-  // Some of the APM functionality requires a VAD decision. In the case that
-  // a decision is externally available for the current frame, it can be passed
-  // in here, before |ProcessStream()| is called.
-  //
-  // VoiceDetection does _not_ need to be enabled to use this. If it happens to
-  // be enabled, detection will be skipped for any frame in which an external
-  // VAD decision is provided.
-  virtual int set_stream_has_voice(bool has_voice) = 0;
-
-  // Specifies the likelihood that a frame will be declared to contain voice.
-  // A higher value makes it more likely that speech will not be clipped, at
-  // the expense of more noise being detected as voice.
-  enum Likelihood {
-    kVeryLowLikelihood,
-    kLowLikelihood,
-    kModerateLikelihood,
-    kHighLikelihood
-  };
-
-  virtual int set_likelihood(Likelihood likelihood) = 0;
-  virtual Likelihood likelihood() const = 0;
-
-  // Sets the |size| of the frames in ms on which the VAD will operate. Larger
-  // frames will improve detection accuracy, but reduce the frequency of
-  // updates.
-  //
-  // This does not impact the size of frames passed to |ProcessStream()|.
-  virtual int set_frame_size_ms(int size) = 0;
-  virtual int frame_size_ms() const = 0;
-
- protected:
-  virtual ~VoiceDetection() {}
-};
-
 }  // namespace webrtc

 #endif  // MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
@@ -91,26 +91,12 @@ class MockEchoControl : public EchoControl {
   MOCK_METHOD1(SetAudioBufferDelay, void(size_t delay_ms));
 };

-class MockVoiceDetection : public VoiceDetection {
- public:
-  virtual ~MockVoiceDetection() {}
-  MOCK_METHOD1(Enable, int(bool enable));
-  MOCK_CONST_METHOD0(is_enabled, bool());
-  MOCK_CONST_METHOD0(stream_has_voice, bool());
-  MOCK_METHOD1(set_stream_has_voice, int(bool has_voice));
-  MOCK_METHOD1(set_likelihood, int(Likelihood likelihood));
-  MOCK_CONST_METHOD0(likelihood, Likelihood());
-  MOCK_METHOD1(set_frame_size_ms, int(int size));
-  MOCK_CONST_METHOD0(frame_size_ms, int());
-};
-
 class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
  public:
   MockAudioProcessing()
       : gain_control_(new ::testing::NiceMock<MockGainControl>()),
         level_estimator_(new ::testing::NiceMock<MockLevelEstimator>()),
-        noise_suppression_(new ::testing::NiceMock<MockNoiseSuppression>()),
-        voice_detection_(new ::testing::NiceMock<MockVoiceDetection>()) {}
+        noise_suppression_(new ::testing::NiceMock<MockNoiseSuppression>()) {}

   virtual ~MockAudioProcessing() {}

@@ -183,9 +169,6 @@ class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
   virtual MockNoiseSuppression* noise_suppression() const {
     return noise_suppression_.get();
   }
-  virtual MockVoiceDetection* voice_detection() const {
-    return voice_detection_.get();
-  }

   MOCK_CONST_METHOD0(GetConfig, AudioProcessing::Config());

@@ -193,7 +176,6 @@ class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
   std::unique_ptr<MockGainControl> gain_control_;
   std::unique_ptr<MockLevelEstimator> level_estimator_;
   std::unique_ptr<MockNoiseSuppression> noise_suppression_;
-  std::unique_ptr<MockVoiceDetection> voice_detection_;
 };

 }  // namespace test
@@ -455,6 +455,10 @@ void AudioProcessingSimulator::CreateAudioProcessor() {
     apm_config.high_pass_filter.enabled = *settings_.use_hpf;
   }

+  if (settings_.use_vad) {
+    apm_config.voice_detection.enabled = *settings_.use_vad;
+  }
+
   if (settings_.use_refined_adaptive_filter) {
     config.Set<RefinedAdaptiveFilter>(
         new RefinedAdaptiveFilter(*settings_.use_refined_adaptive_filter));
@@ -502,10 +506,6 @@ void AudioProcessingSimulator::CreateAudioProcessor() {
     RTC_CHECK_EQ(AudioProcessing::kNoError,
                  ap_->level_estimator()->Enable(*settings_.use_le));
   }
-  if (settings_.use_vad) {
-    RTC_CHECK_EQ(AudioProcessing::kNoError,
-                 ap_->voice_detection()->Enable(*settings_.use_vad));
-  }
   if (settings_.use_agc_limiter) {
     RTC_CHECK_EQ(AudioProcessing::kNoError, ap_->gain_control()->enable_limiter(
                                                 *settings_.use_agc_limiter));
|
|||||||
ap_->gain_control()->set_mode(
|
ap_->gain_control()->set_mode(
|
||||||
static_cast<webrtc::GainControl::Mode>(*settings_.agc_mode)));
|
static_cast<webrtc::GainControl::Mode>(*settings_.agc_mode)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (settings_.vad_likelihood) {
|
|
||||||
RTC_CHECK_EQ(AudioProcessing::kNoError,
|
|
||||||
ap_->voice_detection()->set_likelihood(
|
|
||||||
static_cast<webrtc::VoiceDetection::Likelihood>(
|
|
||||||
*settings_.vad_likelihood)));
|
|
||||||
}
|
|
||||||
if (settings_.ns_level) {
|
if (settings_.ns_level) {
|
||||||
RTC_CHECK_EQ(
|
RTC_CHECK_EQ(
|
||||||
AudioProcessing::kNoError,
|
AudioProcessing::kNoError,
|
||||||
|
@@ -79,7 +79,6 @@ struct SimulationSettings {
   AudioProcessing::Config::GainController2::LevelEstimator
       agc2_adaptive_level_estimator;
   absl::optional<float> pre_amplifier_gain_factor;
-  absl::optional<int> vad_likelihood;
   absl::optional<int> ns_level;
   absl::optional<int> maximum_internal_processing_rate;
   absl::optional<bool> use_refined_adaptive_filter;
@@ -185,10 +185,6 @@ ABSL_FLAG(float,
           pre_amplifier_gain_factor,
           kParameterNotSpecifiedValue,
           "Pre-amplifier gain factor (linear) to apply");
-ABSL_FLAG(int,
-          vad_likelihood,
-          kParameterNotSpecifiedValue,
-          "Specify the VAD likelihood (0-3)");
 ABSL_FLAG(int,
           ns_level,
           kParameterNotSpecifiedValue,
@@ -423,8 +419,6 @@ SimulationSettings CreateSettings() {
                         absl::GetFlag(FLAGS_agc2_adaptive_level_estimator));
   SetSettingIfSpecified(absl::GetFlag(FLAGS_pre_amplifier_gain_factor),
                         &settings.pre_amplifier_gain_factor);
-  SetSettingIfSpecified(absl::GetFlag(FLAGS_vad_likelihood),
-                        &settings.vad_likelihood);
   SetSettingIfSpecified(absl::GetFlag(FLAGS_ns_level), &settings.ns_level);
   SetSettingIfSpecified(absl::GetFlag(FLAGS_maximum_internal_processing_rate),
                         &settings.maximum_internal_processing_rate);
@@ -555,11 +549,6 @@ void PerformBasicParameterSanityChecks(const SimulationSettings& settings) {
           (*settings.agc2_fixed_gain_db) > 90),
       "Error: --agc2_fixed_gain_db must be specified between 0 and 90.\n");

-  ReportConditionalErrorAndExit(
-      settings.vad_likelihood &&
-          ((*settings.vad_likelihood) < 0 || (*settings.vad_likelihood) > 3),
-      "Error: --vad_likelihood must be specified between 0 and 3.\n");
-
   ReportConditionalErrorAndExit(
       settings.ns_level &&
           ((*settings.ns_level) < 0 || (*settings.ns_level) > 3),
modules/audio_processing/voice_detection.cc (new file, 93 lines)
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/voice_detection.h"
+
+#include "api/audio/audio_frame.h"
+#include "common_audio/vad/include/webrtc_vad.h"
+#include "modules/audio_processing/audio_buffer.h"
+#include "rtc_base/checks.h"
+
+namespace webrtc {
+class VoiceDetection::Vad {
+ public:
+  Vad() {
+    state_ = WebRtcVad_Create();
+    RTC_CHECK(state_);
+    int error = WebRtcVad_Init(state_);
+    RTC_DCHECK_EQ(0, error);
+  }
+  ~Vad() { WebRtcVad_Free(state_); }
+
+  Vad(Vad&) = delete;
+  Vad& operator=(Vad&) = delete;
+
+  VadInst* state() { return state_; }
+
+ private:
+  VadInst* state_ = nullptr;
+};
+
+VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
+    : sample_rate_hz_(sample_rate_hz),
+      frame_size_samples_(static_cast<size_t>(sample_rate_hz_ / 100)),
+      likelihood_(likelihood),
+      vad_(new Vad()) {
+  int mode = 2;
+  switch (likelihood) {
+    case VoiceDetection::kVeryLowLikelihood:
+      mode = 3;
+      break;
+    case VoiceDetection::kLowLikelihood:
+      mode = 2;
+      break;
+    case VoiceDetection::kModerateLikelihood:
+      mode = 1;
+      break;
+    case VoiceDetection::kHighLikelihood:
+      mode = 0;
+      break;
+    default:
+      RTC_NOTREACHED();
+      break;
+  }
+  int error = WebRtcVad_set_mode(vad_->state(), mode);
+  RTC_DCHECK_EQ(0, error);
+}
+
+VoiceDetection::~VoiceDetection() {}
+
+bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
+  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
+                audio->num_frames_per_band());
+  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
+  rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
+                                               audio->num_frames_per_band());
+  if (audio->num_channels() == 1) {
+    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
+                  audio->num_frames_per_band(), mixed_low_pass_data.data());
+  } else {
+    const int num_channels = static_cast<int>(audio->num_channels());
+    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
+      int32_t value =
+          FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
+      for (int j = 1; j < num_channels; ++j) {
+        value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
+      }
+      mixed_low_pass_data[i] = value / num_channels;
+    }
+  }
+
+  int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
+                                  mixed_low_pass.data(), frame_size_samples_);
+  RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
+  return vad_ret == 0 ? false : true;
+}
+}  // namespace webrtc
modules/audio_processing/voice_detection.h (new file, 59 lines)
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
+#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
+
+#include <stddef.h>
+
+#include <memory>
+
+#include "modules/audio_processing/include/audio_processing.h"
+
+namespace webrtc {
+
+class AudioBuffer;
+
+// The voice activity detection (VAD) component analyzes the stream to
+// determine if voice is present.
+class VoiceDetection {
+ public:
+  // Specifies the likelihood that a frame will be declared to contain voice.
+  // A higher value makes it more likely that speech will not be clipped, at
+  // the expense of more noise being detected as voice.
+  enum Likelihood {
+    kVeryLowLikelihood,
+    kLowLikelihood,
+    kModerateLikelihood,
+    kHighLikelihood
+  };
+
+  VoiceDetection(int sample_rate_hz, Likelihood likelihood);
+  ~VoiceDetection();
+
+  VoiceDetection(VoiceDetection&) = delete;
+  VoiceDetection& operator=(VoiceDetection&) = delete;
+
+  // Returns true if voice is detected in the current frame.
+  bool ProcessCaptureAudio(AudioBuffer* audio);
+
+  Likelihood likelihood() const { return likelihood_; }
+
+ private:
+  class Vad;
+
+  int sample_rate_hz_;
+  size_t frame_size_samples_;
+  Likelihood likelihood_;
+  std::unique_ptr<Vad> vad_;
+};
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
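A rough usage sketch of this new internal class (a hypothetical helper, not part of the diff; the AudioBuffer is assumed to be a prepared split-band capture buffer and 16 kHz is an assumed split rate, as in AudioProcessingImpl::ProcessCaptureStreamLocked()):

// Runs the WebRTC VAD once on the low band of a prepared AudioBuffer.
bool DetectVoice(webrtc::AudioBuffer* capture_buffer) {
  webrtc::VoiceDetection vad(/*sample_rate_hz=*/16000,
                             webrtc::VoiceDetection::kVeryLowLikelihood);
  return vad.ProcessCaptureAudio(capture_buffer);
}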
modules/audio_processing/voice_detection_impl.cc (deleted file)
@@ -1,168 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "modules/audio_processing/voice_detection_impl.h"
-
-#include "api/audio/audio_frame.h"
-#include "common_audio/vad/include/webrtc_vad.h"
-#include "modules/audio_processing/audio_buffer.h"
-#include "rtc_base/checks.h"
-#include "rtc_base/constructor_magic.h"
-
-namespace webrtc {
-class VoiceDetectionImpl::Vad {
- public:
-  Vad() {
-    state_ = WebRtcVad_Create();
-    RTC_CHECK(state_);
-    int error = WebRtcVad_Init(state_);
-    RTC_DCHECK_EQ(0, error);
-  }
-  ~Vad() { WebRtcVad_Free(state_); }
-  VadInst* state() { return state_; }
-
- private:
-  VadInst* state_ = nullptr;
-  RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
-};
-
-VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
-    : crit_(crit) {
-  RTC_DCHECK(crit);
-}
-
-VoiceDetectionImpl::~VoiceDetectionImpl() {}
-
-void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
-  rtc::CritScope cs(crit_);
-  sample_rate_hz_ = sample_rate_hz;
-  std::unique_ptr<Vad> new_vad;
-  if (enabled_) {
-    new_vad.reset(new Vad());
-  }
-  vad_.swap(new_vad);
-  using_external_vad_ = false;
-  frame_size_samples_ =
-      static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
-  set_likelihood(likelihood_);
-}
-
-bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
-  rtc::CritScope cs(crit_);
-  RTC_DCHECK(enabled_);
-
-  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
-                audio->num_frames_per_band());
-  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
-  rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
-                                               audio->num_frames_per_band());
-  if (audio->num_channels() == 1) {
-    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
-                  audio->num_frames_per_band(), mixed_low_pass_data.data());
-  } else {
-    const int num_channels = static_cast<int>(audio->num_channels());
-    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
-      int32_t value =
-          FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
-      for (int j = 1; j < num_channels; ++j) {
-        value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
-      }
-      mixed_low_pass_data[i] = value / num_channels;
-    }
-  }
-
-  int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
-                                  mixed_low_pass.data(), frame_size_samples_);
-  if (vad_ret == 0) {
-    stream_has_voice_ = false;
-    return false;
-  } else if (vad_ret == 1) {
-    stream_has_voice_ = true;
-  } else {
-    RTC_NOTREACHED();
-  }
-
-  return stream_has_voice_;
-}
-
-int VoiceDetectionImpl::Enable(bool enable) {
-  rtc::CritScope cs(crit_);
-  if (enabled_ != enable) {
-    enabled_ = enable;
-    Initialize(sample_rate_hz_);
-  }
-  return AudioProcessing::kNoError;
-}
-
-bool VoiceDetectionImpl::is_enabled() const {
-  rtc::CritScope cs(crit_);
-  return enabled_;
-}
-
-int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
-  rtc::CritScope cs(crit_);
-  using_external_vad_ = true;
-  stream_has_voice_ = has_voice;
-  return AudioProcessing::kNoError;
-}
-
-bool VoiceDetectionImpl::stream_has_voice() const {
-  rtc::CritScope cs(crit_);
-  // TODO(ajm): enable this assertion?
-  // RTC_DCHECK(using_external_vad_ || is_component_enabled());
-  return stream_has_voice_;
-}
-
-int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
-  rtc::CritScope cs(crit_);
-  likelihood_ = likelihood;
-  if (enabled_) {
-    int mode = 2;
-    switch (likelihood) {
-      case VoiceDetection::kVeryLowLikelihood:
-        mode = 3;
-        break;
-      case VoiceDetection::kLowLikelihood:
-        mode = 2;
-        break;
-      case VoiceDetection::kModerateLikelihood:
-        mode = 1;
-        break;
-      case VoiceDetection::kHighLikelihood:
-        mode = 0;
-        break;
-      default:
-        RTC_NOTREACHED();
-        break;
-    }
-    int error = WebRtcVad_set_mode(vad_->state(), mode);
-    RTC_DCHECK_EQ(0, error);
-  }
-  return AudioProcessing::kNoError;
-}
-
-VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
-  rtc::CritScope cs(crit_);
-  return likelihood_;
-}
-
-int VoiceDetectionImpl::set_frame_size_ms(int size) {
-  rtc::CritScope cs(crit_);
-  RTC_DCHECK_EQ(10, size);  // TODO(ajm): remove when supported.
-  frame_size_ms_ = size;
-  Initialize(sample_rate_hz_);
-  return AudioProcessing::kNoError;
-}
-
-int VoiceDetectionImpl::frame_size_ms() const {
-  rtc::CritScope cs(crit_);
-  return frame_size_ms_;
-}
-}  // namespace webrtc
@ -1,69 +0,0 @@
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_
#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_

#include <stddef.h>

#include <memory>

#include "modules/audio_processing/include/audio_processing.h"
#include "rtc_base/constructor_magic.h"
#include "rtc_base/critical_section.h"
#include "rtc_base/thread_annotations.h"

namespace webrtc {

class AudioBuffer;

class VoiceDetectionImpl : public VoiceDetection {
 public:
  explicit VoiceDetectionImpl(rtc::CriticalSection* crit);
  ~VoiceDetectionImpl() override;

  // TODO(peah): Fold into ctor, once public API is removed.
  void Initialize(int sample_rate_hz);

  // Returns the VAD activity.
  bool ProcessCaptureAudio(AudioBuffer* audio);

  bool using_external_vad() const {
    rtc::CritScope cs(crit_);
    return using_external_vad_;
  }

  // VoiceDetection implementation.
  int Enable(bool enable) override;
  bool is_enabled() const override;
  int set_stream_has_voice(bool has_voice) override;
  bool stream_has_voice() const override;
  int set_likelihood(Likelihood likelihood) override;
  Likelihood likelihood() const override;
  int set_frame_size_ms(int size) override;
  int frame_size_ms() const override;

 private:
  class Vad;

  rtc::CriticalSection* const crit_;
  bool enabled_ RTC_GUARDED_BY(crit_) = false;
  bool stream_has_voice_ RTC_GUARDED_BY(crit_) = false;
  bool using_external_vad_ RTC_GUARDED_BY(crit_) = false;
  Likelihood likelihood_ RTC_GUARDED_BY(crit_) = kLowLikelihood;
  int frame_size_ms_ RTC_GUARDED_BY(crit_) = 10;
  size_t frame_size_samples_ RTC_GUARDED_BY(crit_) = 0;
  int sample_rate_hz_ RTC_GUARDED_BY(crit_) = 0;
  std::unique_ptr<Vad> vad_ RTC_GUARDED_BY(crit_);
  RTC_DISALLOW_IMPLICIT_CONSTRUCTORS(VoiceDetectionImpl);
};
}  // namespace webrtc

#endif  // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_
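For orientation, the interface deleted above was reached through the APM submodule accessor that this CL removes. A rough sketch of that old call pattern, reconstructed from the declarations above and not code contained in this CL, looks like this:

// Sketch of the removed submodule-style access. The voice_detection()
// accessor is the pointer being deleted; Enable(), set_likelihood() and
// stream_has_voice() are the methods declared in the header above.
#include "modules/audio_processing/include/audio_processing.h"

void EnableLegacyVad(webrtc::AudioProcessing* apm) {
  apm->voice_detection()->Enable(true);
  apm->voice_detection()->set_likelihood(
      webrtc::VoiceDetection::kLowLikelihood);
}

bool LegacyStreamHasVoice(webrtc::AudioProcessing* apm) {
  // Reflects only the most recently processed 10 ms capture frame.
  return apm->voice_detection()->stream_has_voice();
}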
@ -13,7 +13,7 @@
#include "modules/audio_processing/audio_buffer.h"
#include "modules/audio_processing/test/audio_buffer_tools.h"
#include "modules/audio_processing/test/bitexactness_tools.h"
#include "modules/audio_processing/voice_detection_impl.h"
#include "modules/audio_processing/voice_detection.h"
#include "test/gtest.h"

namespace webrtc {
@ -22,27 +22,24 @@ namespace {
const int kNumFramesToProcess = 1000;

// Process one frame of data and produce the output.
void ProcessOneFrame(int sample_rate_hz,
bool ProcessOneFrame(int sample_rate_hz,
                     AudioBuffer* audio_buffer,
                     VoiceDetectionImpl* voice_detection) {
                     VoiceDetection* voice_detection) {
  if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {
    audio_buffer->SplitIntoFrequencyBands();
  }

  voice_detection->ProcessCaptureAudio(audio_buffer);
  return voice_detection->ProcessCaptureAudio(audio_buffer);
}

// Processes a specified amount of frames, verifies the results and reports
// any errors.
void RunBitexactnessTest(int sample_rate_hz,
                         size_t num_channels,
                         int frame_size_ms_reference,
                         bool stream_has_voice_reference,
                         VoiceDetection::Likelihood likelihood_reference) {
  rtc::CriticalSection crit_capture;
  VoiceDetectionImpl voice_detection(&crit_capture);
  voice_detection.Initialize(sample_rate_hz > 16000 ? 16000 : sample_rate_hz);
  voice_detection.Enable(true);
                         bool stream_has_voice_reference) {
  int sample_rate_to_use = std::min(sample_rate_hz, 16000);
  VoiceDetection voice_detection(sample_rate_to_use,
                                 VoiceDetection::kLowLikelihood);

  int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
  const StreamConfig capture_config(sample_rate_hz, num_channels, false);
@ -53,6 +50,7 @@ void RunBitexactnessTest(int sample_rate_hz,
  test::InputAudioFile capture_file(
      test::GetApmCaptureTestVectorFileName(sample_rate_hz));
  std::vector<float> capture_input(samples_per_channel * num_channels);
  bool stream_has_voice = false;
  for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
    ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
                                   &capture_file, capture_input);
@ -60,64 +58,47 @@ void RunBitexactnessTest(int sample_rate_hz,
    test::CopyVectorToAudioBuffer(capture_config, capture_input,
                                  &capture_buffer);

    ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
    stream_has_voice =
        ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
  }

  int frame_size_ms = voice_detection.frame_size_ms();
  bool stream_has_voice = voice_detection.stream_has_voice();
  VoiceDetection::Likelihood likelihood = voice_detection.likelihood();

  // Compare the outputs to the references.
  EXPECT_EQ(frame_size_ms_reference, frame_size_ms);
  EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
  EXPECT_EQ(likelihood_reference, likelihood);
}

const int kFrameSizeMsReference = 10;
const bool kStreamHasVoiceReference = true;
const VoiceDetection::Likelihood kLikelihoodReference =
    VoiceDetection::kLowLikelihood;

}  // namespace

TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
  RunBitexactnessTest(8000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(8000, 1, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
  RunBitexactnessTest(16000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(16000, 1, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
  RunBitexactnessTest(32000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(32000, 1, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
  RunBitexactnessTest(48000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(48000, 1, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
  RunBitexactnessTest(8000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(8000, 2, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
  RunBitexactnessTest(16000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(16000, 2, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
  RunBitexactnessTest(32000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(32000, 2, kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
  RunBitexactnessTest(48000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
                      kLikelihoodReference);
  RunBitexactnessTest(48000, 2, kStreamHasVoiceReference);
}

}  // namespace webrtc
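A condensed sketch of the direct-construction pattern the updated test exercises; it relies only on the two-argument constructor and the bool-returning ProcessCaptureAudio() visible above, and the wrapper function name is made up for illustration:

// Sketch mirroring the updated unittest: build the detector once with the
// rate capped at 16 kHz, feed it 10 ms AudioBuffer frames, and take the
// per-frame voice decision from the ProcessCaptureAudio() return value.
#include <algorithm>
#include <vector>

#include "modules/audio_processing/audio_buffer.h"
#include "modules/audio_processing/include/audio_processing.h"
#include "modules/audio_processing/voice_detection.h"

bool StreamHasVoice(int sample_rate_hz,
                    const std::vector<webrtc::AudioBuffer*>& frames) {
  webrtc::VoiceDetection vad(std::min(sample_rate_hz, 16000),
                             webrtc::VoiceDetection::kLowLikelihood);
  bool stream_has_voice = false;
  for (webrtc::AudioBuffer* frame : frames) {
    // Full-band rates are split so the detector sees the lower band only,
    // as in the test's ProcessOneFrame() helper.
    if (sample_rate_hz > webrtc::AudioProcessing::kSampleRate16kHz) {
      frame->SplitIntoFrequencyBands();
    }
    stream_has_voice = vad.ProcessCaptureAudio(frame);
  }
  return stream_has_voice;
}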
@ -1 +1 @@
e540fa8940b41d0cda26cdef937be3a455a04be7
e9569d846d21e027bfdcae76a40146bc10d49d54
@ -1 +1 @@
2811f534082857ac9b9447a3e53028ef11851052
53dd63154cc2694a3425596d9a8300fa2c66215d
@ -1 +1 @@
cc82c345f1e7ef17b12c2da41a0a9f73b09ca8f6
2b31852bbce2b0b19ee36c47b18352e035cb08c5
@ -145,7 +145,6 @@ std::unique_ptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
  apm->ApplyConfig(apm_config);

  apm->level_estimator()->Enable(use_le);
  apm->voice_detection()->Enable(use_vad);

  return apm;
}
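With the accessor removed, a caller like this fuzzer would instead carry the flag in the config object it already builds and applies a few lines above; note that the voice_detection.enabled field name below is an assumption about AudioProcessing::Config and is not shown anywhere in this diff:

// Hypothetical replacement for the deleted Enable() call: route the fuzzed
// flag through the config applied via ApplyConfig(). The field name
// voice_detection.enabled is assumed, not taken from this CL.
apm_config.voice_detection.enabled = use_vad;
apm->ApplyConfig(apm_config);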