AudioProcessingImpl: Add a VAD submodule

Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl
and enable injecting speech probability into GainController2.

Bug: webrtc:13663
Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166
Commit-Queue: Hanna Silen <silen@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37275}
This commit is contained in:
Hanna Silen
2022-06-16 16:35:45 +02:00
committed by WebRTC LUCI CQ
parent ff45105b42
commit 0c1ad2992b
7 changed files with 348 additions and 17 deletions

View File

@ -47,7 +47,7 @@ float RunAgc2WithConstantInput(GainController2& agc2,
// Give time to the level estimator to converge.
for (int i = 0; i < num_frames + 1; ++i) {
SetAudioBufferSamples(input_level, ab);
agc2.Process(&ab);
agc2.Process(/*speech_probability=*/absl::nullopt, &ab);
}
// Return the last sample from the last processed frame.
@ -62,7 +62,8 @@ std::unique_ptr<GainController2> CreateAgc2FixedDigitalMode(
config.fixed_digital.gain_db = fixed_gain_db;
EXPECT_TRUE(GainController2::Validate(config));
return std::make_unique<GainController2>(config, sample_rate_hz,
/*num_channels=*/1);
/*num_channels=*/1,
/*use_internal_vad=*/true);
}
} // namespace
@ -138,7 +139,8 @@ TEST(GainController2, CheckAdaptiveDigitalMaxOutputNoiseLevelConfig) {
// Checks that the default config is applied.
TEST(GainController2, ApplyDefaultConfig) {
auto gain_controller2 = std::make_unique<GainController2>(
Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2);
Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2,
/*use_internal_vad=*/true);
EXPECT_TRUE(gain_controller2.get());
}
@ -253,7 +255,8 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
Agc2Config config;
config.fixed_digital.gain_db = 0.0f;
config.adaptive_digital.enabled = true;
GainController2 agc2(config, kSampleRateHz, kStereo);
GainController2 agc2(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/true);
test::InputAudioFile input_file(
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
@ -276,16 +279,16 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
stream_config.num_channels(), &input_file,
frame);
// Apply a fixed gain to the input audio.
for (float& x : frame)
for (float& x : frame) {
x *= gain;
}
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
// Process.
agc2.Process(&audio_buffer);
agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
}
// Estimate the applied gain by processing a probing frame.
SetAudioBufferSamples(/*value=*/1.0f, audio_buffer);
agc2.Process(&audio_buffer);
agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
const float applied_gain_db =
20.0f * std::log10(audio_buffer.channels_const()[0][0]);
@ -294,5 +297,196 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
EXPECT_NEAR(applied_gain_db, kExpectedGainDb, kToleranceDb);
}
// Processes a test audio file and checks that the injected speech probability
// is ignored when the internal VAD is used.
TEST(GainController2,
CheckInjectedVadProbabilityNotUsedWithAdaptiveDigitalController) {
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
constexpr int kStereo = 2;
// Create AGC2 enabling only the adaptive digital controller.
Agc2Config config;
config.fixed_digital.gain_db = 0.0f;
config.adaptive_digital.enabled = true;
GainController2 agc2(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/true);
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/true);
test::InputAudioFile input_file(
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
/*loop_at_end=*/true);
const StreamConfig stream_config(kSampleRateHz, kStereo);
// Init buffers.
constexpr int kFrameDurationMs = 10;
std::vector<float> frame(kStereo * stream_config.num_frames());
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
kSampleRateHz, kStereo);
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
kStereo, kSampleRateHz, kStereo);
// Simulate.
constexpr float kGainDb = -6.0f;
const float gain = std::pow(10.0f, kGainDb / 20.0f);
constexpr int kDurationMs = 10000;
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
constexpr float kEpsilon = 0.0001f;
bool all_samples_zero = true;
for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
stream_config.num_channels(), &input_file,
frame);
// Apply a fixed gain to the input audio.
for (float& x : frame) {
x *= gain;
}
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
agc2.Process(kSpeechProbabilities[j], &audio_buffer);
test::CopyVectorToAudioBuffer(stream_config, frame,
&audio_buffer_reference);
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
// Check the output buffers.
for (int i = 0; i < kStereo; ++i) {
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
all_samples_zero &=
fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
audio_buffer_reference.channels_const()[i][j]);
}
}
}
EXPECT_FALSE(all_samples_zero);
}
// Processes a test audio file and checks that the injected speech probability
// is not ignored when the internal VAD is not used.
TEST(GainController2,
CheckInjectedVadProbabilityUsedWithAdaptiveDigitalController) {
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
constexpr int kStereo = 2;
// Create AGC2 enabling only the adaptive digital controller.
Agc2Config config;
config.fixed_digital.gain_db = 0.0f;
config.adaptive_digital.enabled = true;
GainController2 agc2(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/false);
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/true);
test::InputAudioFile input_file(
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
/*loop_at_end=*/true);
const StreamConfig stream_config(kSampleRateHz, kStereo);
// Init buffers.
constexpr int kFrameDurationMs = 10;
std::vector<float> frame(kStereo * stream_config.num_frames());
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
kSampleRateHz, kStereo);
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
kStereo, kSampleRateHz, kStereo);
// Simulate.
constexpr float kGainDb = -6.0f;
const float gain = std::pow(10.0f, kGainDb / 20.0f);
constexpr int kDurationMs = 10000;
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
constexpr float kEpsilon = 0.0001f;
bool all_samples_zero = true;
bool all_samples_equal = true;
for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
stream_config.num_channels(), &input_file,
frame);
// Apply a fixed gain to the input audio.
for (float& x : frame) {
x *= gain;
}
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
agc2.Process(kSpeechProbabilities[j], &audio_buffer);
test::CopyVectorToAudioBuffer(stream_config, frame,
&audio_buffer_reference);
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
// Check the output buffers.
for (int i = 0; i < kStereo; ++i) {
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
all_samples_zero &=
fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
all_samples_equal &=
fabs(audio_buffer.channels_const()[i][j] -
audio_buffer_reference.channels_const()[i][j]) < kEpsilon;
}
}
}
EXPECT_FALSE(all_samples_zero);
EXPECT_FALSE(all_samples_equal);
}
// Processes a test audio file and checks that the output is equal when
// an injected speech probability from `VoiceActivityDetectorWrapper` and
// the speech probability computed by the internal VAD are the same.
TEST(GainController2,
CheckEqualResultFromInjectedVadProbabilityWithAdaptiveDigitalController) {
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
constexpr int kStereo = 2;
// Create AGC2 enabling only the adaptive digital controller.
Agc2Config config;
config.fixed_digital.gain_db = 0.0f;
config.adaptive_digital.enabled = true;
GainController2 agc2(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/false);
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
/*use_internal_vad=*/true);
VoiceActivityDetectorWrapper vad(config.adaptive_digital.vad_reset_period_ms,
GetAvailableCpuFeatures(), kSampleRateHz);
test::InputAudioFile input_file(
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
/*loop_at_end=*/true);
const StreamConfig stream_config(kSampleRateHz, kStereo);
// Init buffers.
constexpr int kFrameDurationMs = 10;
std::vector<float> frame(kStereo * stream_config.num_frames());
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
kSampleRateHz, kStereo);
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
kStereo, kSampleRateHz, kStereo);
// Simulate.
constexpr float kGainDb = -6.0f;
const float gain = std::pow(10.0f, kGainDb / 20.0f);
constexpr int kDurationMs = 10000;
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
for (int i = 0; i < kNumFramesToProcess; ++i) {
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
stream_config.num_channels(), &input_file,
frame);
// Apply a fixed gain to the input audio.
for (float& x : frame) {
x *= gain;
}
test::CopyVectorToAudioBuffer(stream_config, frame,
&audio_buffer_reference);
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
agc2.Process(vad.Analyze(AudioFrameView<const float>(
audio_buffer.channels(), audio_buffer.num_channels(),
audio_buffer.num_frames())),
&audio_buffer);
// Check the output buffer.
for (int i = 0; i < kStereo; ++i) {
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
audio_buffer_reference.channels_const()[i][j]);
}
}
}
}
} // namespace test
} // namespace webrtc