AGC2: gain increase allowed once enough adjacent speech frames observed

Make the digital adaptive gain applier more robust to VAD false
positives. This is achieved by allowing a gain increase only after enough
adjacent speech frames have been observed.

Tested:
- Bit-exactness verified with audioproc_f
- If `kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold` is set to 2,
  the output is no longer bit-exact

Bug: webrtc:7494
Change-Id: I3bab5a449aaf0ef1a64b671b413ba2ddb4688cd2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/186042
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32263}
This commit is contained in:
Alessio Bazzica
2020-09-30 22:50:18 +02:00
committed by Commit Bot
parent c5204017e1
commit 87b86acde9
5 changed files with 105 additions and 26 deletions

View File

@ -87,13 +87,23 @@ float ComputeGainChangeThisFrameDb(float target_gain_db,
AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier(
ApmDataDumper* apm_data_dumper)
: AdaptiveDigitalGainApplier(
apm_data_dumper,
kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold) {}
AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier(
ApmDataDumper* apm_data_dumper,
int adjacent_speech_frames_threshold)
: apm_data_dumper_(apm_data_dumper),
gain_applier_(
/*hard_clip_samples=*/false,
/*initial_gain_factor=*/DbToRatio(kInitialAdaptiveDigitalGainDb)),
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
calls_since_last_gain_log_(0),
gain_increase_allowed_(true),
last_gain_db_(kInitialAdaptiveDigitalGainDb) {}
frames_to_gain_increase_allowed_(adjacent_speech_frames_threshold_),
last_gain_db_(kInitialAdaptiveDigitalGainDb) {
RTC_DCHECK_GE(frames_to_gain_increase_allowed_, 1);
}
void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
AudioFrameView<float> frame) {
@ -116,12 +126,17 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
info.input_noise_level_dbfs, apm_data_dumper_),
last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident);
// Forbid increasing the gain when there is no speech.
gain_increase_allowed_ =
info.vad_result.speech_probability > kVadConfidenceThreshold;
// Forbid increasing the gain until enough adjacent speech frames are
// observed.
if (info.vad_result.speech_probability < kVadConfidenceThreshold) {
frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_;
} else if (frames_to_gain_increase_allowed_ > 0) {
frames_to_gain_increase_allowed_--;
}
const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb(
target_gain_db, last_gain_db_, gain_increase_allowed_);
target_gain_db, last_gain_db_,
/*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0);
apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db",
target_gain_db - last_gain_db_);

View File

@ -35,6 +35,10 @@ class AdaptiveDigitalGainApplier {
};
explicit AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper);
// Ctor. `adjacent_speech_frames_threshold` indicates how many speech frames
// are required before a gain increase is allowed.
AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper,
int adjacent_speech_frames_threshold);
AdaptiveDigitalGainApplier(const AdaptiveDigitalGainApplier&) = delete;
AdaptiveDigitalGainApplier& operator=(const AdaptiveDigitalGainApplier&) =
delete;
@ -46,8 +50,10 @@ class AdaptiveDigitalGainApplier {
ApmDataDumper* const apm_data_dumper_;
GainApplier gain_applier_;
const int adjacent_speech_frames_threshold_;
int calls_since_last_gain_log_;
bool gain_increase_allowed_;
int frames_to_gain_increase_allowed_;
float last_gain_db_;
};

View File

@ -21,6 +21,10 @@
namespace webrtc {
namespace {
constexpr int kMono = 1;
constexpr int kStereo = 2;
constexpr int kFrameLen10ms48kHz = 480;
// Constants used in place of estimated noise levels.
constexpr float kNoNoiseDbfs = -90.f;
constexpr float kWithNoiseDbfs = -20.f;
@ -36,7 +40,7 @@ float RunOnConstantLevel(int num_iterations,
float gain_linear = 0.f;
for (int i = 0; i < num_iterations; ++i) {
VectorFloatFrame fake_audio(1, 1, 1.f);
VectorFloatFrame fake_audio(kMono, 1, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info;
info.input_level_dbfs = input_level_dbfs;
info.input_noise_level_dbfs = kNoNoiseDbfs;
@ -62,7 +66,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
// Make one call with reasonable audio level values and settings.
VectorFloatFrame fake_audio(2, 480, 10000.f);
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = -5.0;
gain_applier.Process(kFrameInfo, fake_audio.float_frame_view());
@ -114,7 +118,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
float last_gain_linear = 1.f;
for (int i = 0; i < kNumFramesToAdapt; ++i) {
SCOPED_TRACE(i);
VectorFloatFrame fake_audio(1, 1, 1.f);
VectorFloatFrame fake_audio(kMono, 1, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = initial_level_dbfs;
gain_applier.Process(info, fake_audio.float_frame_view());
@ -127,7 +131,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
// Check that the same is true when gain decreases as well.
for (int i = 0; i < kNumFramesToAdapt; ++i) {
SCOPED_TRACE(i);
VectorFloatFrame fake_audio(1, 1, 1.f);
VectorFloatFrame fake_audio(kMono, 1, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = 0.f;
gain_applier.Process(info, fake_audio.float_frame_view());
@ -143,9 +147,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
constexpr float initial_level_dbfs = -25.f;
constexpr int num_samples = 480;
VectorFloatFrame fake_audio(1, num_samples, 1.f);
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = initial_level_dbfs;
gain_applier.Process(info, fake_audio.float_frame_view());
@ -158,7 +161,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
}
const float kMaxChangePerFrameLinear = DbToRatio(kMaxGainChangePerFrameDb);
const float kMaxChangePerSample = kMaxChangePerFrameLinear / num_samples;
const float kMaxChangePerSample =
kMaxChangePerFrameLinear / kFrameLen10ms48kHz;
EXPECT_LE(maximal_difference, kMaxChangePerSample);
}
@ -168,7 +172,6 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
constexpr float initial_level_dbfs = -25.f;
constexpr int num_samples = 480;
constexpr int num_initial_frames =
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
constexpr int num_frames = 50;
@ -176,7 +179,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low";
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
VectorFloatFrame fake_audio(1, num_samples, 1.f);
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = initial_level_dbfs;
info.input_noise_level_dbfs = kWithNoiseDbfs;
@ -198,7 +201,7 @@ TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) {
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
// Make one call with positive audio level values and settings.
VectorFloatFrame fake_audio(2, 480, 10000.f);
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = 5.f;
gain_applier.Process(info, fake_audio.float_frame_view());
@ -209,7 +212,6 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
constexpr float initial_level_dbfs = -25.f;
constexpr int num_samples = 480;
constexpr int num_initial_frames =
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
constexpr int num_frames = 50;
@ -217,7 +219,7 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low";
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
VectorFloatFrame fake_audio(1, num_samples, 1.f);
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
info.input_level_dbfs = initial_level_dbfs;
info.limiter_envelope_dbfs = 1.f;
@ -235,5 +237,59 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
}
}
// Fixture for the value-parametrized `AdaptiveDigitalGainApplier` tests. The
// integer test parameter is used as the adjacent speech frames threshold
// passed to the gain applier under test.
class AdaptiveDigitalGainApplierTest : public ::testing::TestWithParam<int> {
 protected:
  // Returns the threshold associated with the current test instance.
  int AdjacentSpeechFramesThreshold() const {
    const int threshold = GetParam();
    return threshold;
  }
};
// Checks that the gain read from the processed audio stays constant while
// only `AdjacentSpeechFramesThreshold()` frames have been processed.
// NOTE(review): assumes `kFrameInfo` (defined outside this view) describes a
// frame that the VAD classifies as speech - confirm.
TEST_P(AdaptiveDigitalGainApplierTest,
       DoNotIncreaseGainWithTooFewSpeechFrames) {
  const int num_frames = AdjacentSpeechFramesThreshold();
  ApmDataDumper apm_data_dumper(0);
  AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper, num_frames);

  AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
  info.input_level_dbfs = -25.0;

  float previous_gain = 0.f;
  for (int frame = 0; frame < num_frames; ++frame) {
    SCOPED_TRACE(frame);
    // All the input samples equal 1, hence the first output sample is read as
    // the gain applied at the beginning of the frame.
    VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
    gain_applier.Process(info, audio.float_frame_view());
    const float current_gain = audio.float_frame_view().channel(0)[0];
    // The first frame has no predecessor to compare with.
    if (frame != 0) {
      EXPECT_EQ(previous_gain, current_gain);  // No gain increase.
    }
    previous_gain = current_gain;
  }
}
// Checks that, after `AdjacentSpeechFramesThreshold()` frames have been
// processed, the gain read from one additional frame is higher than the gain
// read from the last of those frames.
// NOTE(review): assumes `kFrameInfo` (defined outside this view) describes a
// frame that the VAD classifies as speech - confirm.
TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) {
  const int num_warm_up_frames = AdjacentSpeechFramesThreshold();
  ApmDataDumper apm_data_dumper(0);
  AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper,
                                          num_warm_up_frames);

  AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
  info.input_level_dbfs = -25.0;

  // Feed the warm-up frames and remember the gain observed on the last one.
  // All the input samples equal 1, hence the first output sample is read as
  // the applied gain.
  float gain_before = 0.f;
  for (int remaining = num_warm_up_frames; remaining > 0; --remaining) {
    VectorFloatFrame warm_up_audio(kMono, kFrameLen10ms48kHz, 1.f);
    gain_applier.Process(info, warm_up_audio.float_frame_view());
    gain_before = warm_up_audio.float_frame_view().channel(0)[0];
  }

  // Process one more speech frame.
  VectorFloatFrame extra_audio(kMono, kFrameLen10ms48kHz, 1.f);
  gain_applier.Process(info, extra_audio.float_frame_view());
  const float gain_after = extra_audio.float_frame_view().channel(0)[0];

  // The gain has increased.
  EXPECT_GT(gain_after, gain_before);
}
// Runs the `AdaptiveDigitalGainApplierTest` tests with adjacent speech frames
// thresholds equal to 1, 7 and 31.
INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
AdaptiveDigitalGainApplierTest,
::testing::Values(1, 7, 31));
} // namespace
} // namespace webrtc

View File

@ -64,7 +64,7 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
: AdaptiveModeLevelEstimator(
apm_data_dumper,
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
kDefaultAdjacentSpeechFramesThreshold,
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
kDefaultInitialSaturationMarginDb,
kDefaultExtraSaturationMarginDb) {}
@ -73,11 +73,12 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
bool use_saturation_protector,
float extra_saturation_margin_db)
: AdaptiveModeLevelEstimator(apm_data_dumper,
level_estimator,
kDefaultAdjacentSpeechFramesThreshold,
kDefaultInitialSaturationMarginDb,
extra_saturation_margin_db) {
: AdaptiveModeLevelEstimator(
apm_data_dumper,
level_estimator,
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
kDefaultInitialSaturationMarginDb,
extra_saturation_margin_db) {
if (!use_saturation_protector) {
RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled.";
}

View File

@ -51,7 +51,8 @@ constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
// Robust VAD probability and speech decisions.
constexpr float kDefaultSmoothedVadProbabilityAttack = 1.f;
constexpr int kDefaultAdjacentSpeechFramesThreshold = 1;
constexpr int kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold = 1;
constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1;
// Saturation Protector settings.
constexpr float kDefaultInitialSaturationMarginDb = 20.f;