AudioProcessingImpl: Add a VAD submodule
Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl and enable injecting speech probability into GainController2. Bug: webrtc:13663 Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166 Commit-Queue: Hanna Silen <silen@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/main@{#37275}
This commit is contained in:
committed by
WebRTC LUCI CQ
parent
ff45105b42
commit
0c1ad2992b
@ -47,7 +47,7 @@ float RunAgc2WithConstantInput(GainController2& agc2,
|
||||
// Give time to the level estimator to converge.
|
||||
for (int i = 0; i < num_frames + 1; ++i) {
|
||||
SetAudioBufferSamples(input_level, ab);
|
||||
agc2.Process(&ab);
|
||||
agc2.Process(/*speech_probability=*/absl::nullopt, &ab);
|
||||
}
|
||||
|
||||
// Return the last sample from the last processed frame.
|
||||
@ -62,7 +62,8 @@ std::unique_ptr<GainController2> CreateAgc2FixedDigitalMode(
|
||||
config.fixed_digital.gain_db = fixed_gain_db;
|
||||
EXPECT_TRUE(GainController2::Validate(config));
|
||||
return std::make_unique<GainController2>(config, sample_rate_hz,
|
||||
/*num_channels=*/1);
|
||||
/*num_channels=*/1,
|
||||
/*use_internal_vad=*/true);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -138,7 +139,8 @@ TEST(GainController2, CheckAdaptiveDigitalMaxOutputNoiseLevelConfig) {
|
||||
// Checks that the default config is applied.
|
||||
TEST(GainController2, ApplyDefaultConfig) {
|
||||
auto gain_controller2 = std::make_unique<GainController2>(
|
||||
Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2);
|
||||
Agc2Config{}, /*sample_rate_hz=*/16000, /*num_channels=*/2,
|
||||
/*use_internal_vad=*/true);
|
||||
EXPECT_TRUE(gain_controller2.get());
|
||||
}
|
||||
|
||||
@ -253,7 +255,8 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
|
||||
Agc2Config config;
|
||||
config.fixed_digital.gain_db = 0.0f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
GainController2 agc2(config, kSampleRateHz, kStereo);
|
||||
GainController2 agc2(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/true);
|
||||
|
||||
test::InputAudioFile input_file(
|
||||
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
|
||||
@ -276,16 +279,16 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
|
||||
stream_config.num_channels(), &input_file,
|
||||
frame);
|
||||
// Apply a fixed gain to the input audio.
|
||||
for (float& x : frame)
|
||||
for (float& x : frame) {
|
||||
x *= gain;
|
||||
}
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
|
||||
// Process.
|
||||
agc2.Process(&audio_buffer);
|
||||
agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
|
||||
}
|
||||
|
||||
// Estimate the applied gain by processing a probing frame.
|
||||
SetAudioBufferSamples(/*value=*/1.0f, audio_buffer);
|
||||
agc2.Process(&audio_buffer);
|
||||
agc2.Process(/*speech_probability=*/absl::nullopt, &audio_buffer);
|
||||
const float applied_gain_db =
|
||||
20.0f * std::log10(audio_buffer.channels_const()[0][0]);
|
||||
|
||||
@ -294,5 +297,196 @@ TEST(GainController2, CheckFinalGainWithAdaptiveDigitalController) {
|
||||
EXPECT_NEAR(applied_gain_db, kExpectedGainDb, kToleranceDb);
|
||||
}
|
||||
|
||||
// Processes a test audio file and checks that the injected speech probability
|
||||
// is ignored when the internal VAD is used.
|
||||
TEST(GainController2,
|
||||
CheckInjectedVadProbabilityNotUsedWithAdaptiveDigitalController) {
|
||||
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
|
||||
constexpr int kStereo = 2;
|
||||
|
||||
// Create AGC2 enabling only the adaptive digital controller.
|
||||
Agc2Config config;
|
||||
config.fixed_digital.gain_db = 0.0f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
GainController2 agc2(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/true);
|
||||
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/true);
|
||||
|
||||
test::InputAudioFile input_file(
|
||||
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
|
||||
/*loop_at_end=*/true);
|
||||
const StreamConfig stream_config(kSampleRateHz, kStereo);
|
||||
|
||||
// Init buffers.
|
||||
constexpr int kFrameDurationMs = 10;
|
||||
std::vector<float> frame(kStereo * stream_config.num_frames());
|
||||
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
|
||||
kSampleRateHz, kStereo);
|
||||
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
|
||||
kStereo, kSampleRateHz, kStereo);
|
||||
|
||||
// Simulate.
|
||||
constexpr float kGainDb = -6.0f;
|
||||
const float gain = std::pow(10.0f, kGainDb / 20.0f);
|
||||
constexpr int kDurationMs = 10000;
|
||||
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
|
||||
constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
|
||||
constexpr float kEpsilon = 0.0001f;
|
||||
bool all_samples_zero = true;
|
||||
for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
|
||||
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
|
||||
stream_config.num_channels(), &input_file,
|
||||
frame);
|
||||
// Apply a fixed gain to the input audio.
|
||||
for (float& x : frame) {
|
||||
x *= gain;
|
||||
}
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
|
||||
agc2.Process(kSpeechProbabilities[j], &audio_buffer);
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame,
|
||||
&audio_buffer_reference);
|
||||
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
|
||||
|
||||
// Check the output buffers.
|
||||
for (int i = 0; i < kStereo; ++i) {
|
||||
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
|
||||
all_samples_zero &=
|
||||
fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
|
||||
EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
|
||||
audio_buffer_reference.channels_const()[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPECT_FALSE(all_samples_zero);
|
||||
}
|
||||
|
||||
// Processes a test audio file and checks that the injected speech probability
|
||||
// is not ignored when the internal VAD is not used.
|
||||
TEST(GainController2,
|
||||
CheckInjectedVadProbabilityUsedWithAdaptiveDigitalController) {
|
||||
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
|
||||
constexpr int kStereo = 2;
|
||||
|
||||
// Create AGC2 enabling only the adaptive digital controller.
|
||||
Agc2Config config;
|
||||
config.fixed_digital.gain_db = 0.0f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
GainController2 agc2(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/false);
|
||||
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/true);
|
||||
|
||||
test::InputAudioFile input_file(
|
||||
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
|
||||
/*loop_at_end=*/true);
|
||||
const StreamConfig stream_config(kSampleRateHz, kStereo);
|
||||
|
||||
// Init buffers.
|
||||
constexpr int kFrameDurationMs = 10;
|
||||
std::vector<float> frame(kStereo * stream_config.num_frames());
|
||||
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
|
||||
kSampleRateHz, kStereo);
|
||||
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
|
||||
kStereo, kSampleRateHz, kStereo);
|
||||
// Simulate.
|
||||
constexpr float kGainDb = -6.0f;
|
||||
const float gain = std::pow(10.0f, kGainDb / 20.0f);
|
||||
constexpr int kDurationMs = 10000;
|
||||
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
|
||||
constexpr float kSpeechProbabilities[] = {1.0f, 0.3f};
|
||||
constexpr float kEpsilon = 0.0001f;
|
||||
bool all_samples_zero = true;
|
||||
bool all_samples_equal = true;
|
||||
for (int i = 0, j = 0; i < kNumFramesToProcess; ++i, j = 1 - j) {
|
||||
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
|
||||
stream_config.num_channels(), &input_file,
|
||||
frame);
|
||||
// Apply a fixed gain to the input audio.
|
||||
for (float& x : frame) {
|
||||
x *= gain;
|
||||
}
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
|
||||
agc2.Process(kSpeechProbabilities[j], &audio_buffer);
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame,
|
||||
&audio_buffer_reference);
|
||||
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
|
||||
// Check the output buffers.
|
||||
for (int i = 0; i < kStereo; ++i) {
|
||||
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
|
||||
all_samples_zero &=
|
||||
fabs(audio_buffer.channels_const()[i][j]) < kEpsilon;
|
||||
all_samples_equal &=
|
||||
fabs(audio_buffer.channels_const()[i][j] -
|
||||
audio_buffer_reference.channels_const()[i][j]) < kEpsilon;
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPECT_FALSE(all_samples_zero);
|
||||
EXPECT_FALSE(all_samples_equal);
|
||||
}
|
||||
|
||||
// Processes a test audio file and checks that the output is equal when
|
||||
// an injected speech probability from `VoiceActivityDetectorWrapper` and
|
||||
// the speech probability computed by the internal VAD are the same.
|
||||
TEST(GainController2,
|
||||
CheckEqualResultFromInjectedVadProbabilityWithAdaptiveDigitalController) {
|
||||
constexpr int kSampleRateHz = AudioProcessing::kSampleRate48kHz;
|
||||
constexpr int kStereo = 2;
|
||||
|
||||
// Create AGC2 enabling only the adaptive digital controller.
|
||||
Agc2Config config;
|
||||
config.fixed_digital.gain_db = 0.0f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
GainController2 agc2(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/false);
|
||||
GainController2 agc2_reference(config, kSampleRateHz, kStereo,
|
||||
/*use_internal_vad=*/true);
|
||||
VoiceActivityDetectorWrapper vad(config.adaptive_digital.vad_reset_period_ms,
|
||||
GetAvailableCpuFeatures(), kSampleRateHz);
|
||||
test::InputAudioFile input_file(
|
||||
test::GetApmCaptureTestVectorFileName(kSampleRateHz),
|
||||
/*loop_at_end=*/true);
|
||||
const StreamConfig stream_config(kSampleRateHz, kStereo);
|
||||
|
||||
// Init buffers.
|
||||
constexpr int kFrameDurationMs = 10;
|
||||
std::vector<float> frame(kStereo * stream_config.num_frames());
|
||||
AudioBuffer audio_buffer(kSampleRateHz, kStereo, kSampleRateHz, kStereo,
|
||||
kSampleRateHz, kStereo);
|
||||
AudioBuffer audio_buffer_reference(kSampleRateHz, kStereo, kSampleRateHz,
|
||||
kStereo, kSampleRateHz, kStereo);
|
||||
|
||||
// Simulate.
|
||||
constexpr float kGainDb = -6.0f;
|
||||
const float gain = std::pow(10.0f, kGainDb / 20.0f);
|
||||
constexpr int kDurationMs = 10000;
|
||||
constexpr int kNumFramesToProcess = kDurationMs / kFrameDurationMs;
|
||||
for (int i = 0; i < kNumFramesToProcess; ++i) {
|
||||
ReadFloatSamplesFromStereoFile(stream_config.num_frames(),
|
||||
stream_config.num_channels(), &input_file,
|
||||
frame);
|
||||
// Apply a fixed gain to the input audio.
|
||||
for (float& x : frame) {
|
||||
x *= gain;
|
||||
}
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame,
|
||||
&audio_buffer_reference);
|
||||
agc2_reference.Process(absl::nullopt, &audio_buffer_reference);
|
||||
test::CopyVectorToAudioBuffer(stream_config, frame, &audio_buffer);
|
||||
agc2.Process(vad.Analyze(AudioFrameView<const float>(
|
||||
audio_buffer.channels(), audio_buffer.num_channels(),
|
||||
audio_buffer.num_frames())),
|
||||
&audio_buffer);
|
||||
// Check the output buffer.
|
||||
for (int i = 0; i < kStereo; ++i) {
|
||||
for (int j = 0; j < static_cast<int>(audio_buffer.num_frames()); ++j) {
|
||||
EXPECT_FLOAT_EQ(audio_buffer.channels_const()[i][j],
|
||||
audio_buffer_reference.channels_const()[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
|
||||
Reference in New Issue
Block a user