AGC2: gain increase allowed once enough adjacent speech frames observed
Make the digital adaptive gain applier more robust to VAD false positives by allowing a gain increase only once enough adjacent speech frames have been observed.

Tested:
- Bit-exactness verified with audioproc_f.
- If `kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold` == 2, the output is not bit-exact.

Bug: webrtc:7494
Change-Id: I3bab5a449aaf0ef1a64b671b413ba2ddb4688cd2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/186042
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32263}
This commit is contained in:
committed by
Commit Bot
parent
c5204017e1
commit
87b86acde9
@ -87,13 +87,23 @@ float ComputeGainChangeThisFrameDb(float target_gain_db,
|
||||
|
||||
// Single-argument constructor. Delegates to the two-argument constructor,
// passing `kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold` as the
// adjacent speech frames threshold.
AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier(
|
||||
ApmDataDumper* apm_data_dumper)
|
||||
: AdaptiveDigitalGainApplier(
|
||||
apm_data_dumper,
|
||||
kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold) {}
|
||||
|
||||
// Two-argument constructor. Stores `apm_data_dumper`, builds the gain applier
// with hard clipping disabled and an initial gain factor of
// `DbToRatio(kInitialAdaptiveDigitalGainDb)`, and records
// `adjacent_speech_frames_threshold`.
// NOTE(review): this span is a rendered diff hunk. The
// `gain_increase_allowed_(true)` initializer and the first
// `last_gain_db_(...) {}` line appear to be the pre-change lines that the
// initializers following them replace - confirm against the real file.
AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier(
|
||||
ApmDataDumper* apm_data_dumper,
|
||||
int adjacent_speech_frames_threshold)
|
||||
: apm_data_dumper_(apm_data_dumper),
|
||||
gain_applier_(
|
||||
/*hard_clip_samples=*/false,
|
||||
/*initial_gain_factor=*/DbToRatio(kInitialAdaptiveDigitalGainDb)),
|
||||
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
|
||||
calls_since_last_gain_log_(0),
|
||||
gain_increase_allowed_(true),
|
||||
last_gain_db_(kInitialAdaptiveDigitalGainDb) {}
|
||||
// The countdown starts at the threshold and is initialized from the member
// `adjacent_speech_frames_threshold_`, not from the constructor argument.
// NOTE(review): this relies on that member being declared before
// `frames_to_gain_increase_allowed_` in the class - confirm.
frames_to_gain_increase_allowed_(adjacent_speech_frames_threshold_),
|
||||
last_gain_db_(kInitialAdaptiveDigitalGainDb) {
|
||||
// The initial countdown, and hence the threshold it was copied from, must be
// at least 1.
RTC_DCHECK_GE(frames_to_gain_increase_allowed_, 1);
|
||||
}
|
||||
|
||||
void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
|
||||
AudioFrameView<float> frame) {
|
||||
@ -116,12 +126,17 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
|
||||
info.input_noise_level_dbfs, apm_data_dumper_),
|
||||
last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident);
|
||||
|
||||
// Forbid increasing the gain when there is no speech.
|
||||
gain_increase_allowed_ =
|
||||
info.vad_result.speech_probability > kVadConfidenceThreshold;
|
||||
// Forbid increasing the gain until enough adjacent speech frames are
|
||||
// observed.
|
||||
if (info.vad_result.speech_probability < kVadConfidenceThreshold) {
|
||||
frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_;
|
||||
} else if (frames_to_gain_increase_allowed_ > 0) {
|
||||
frames_to_gain_increase_allowed_--;
|
||||
}
|
||||
|
||||
const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb(
|
||||
target_gain_db, last_gain_db_, gain_increase_allowed_);
|
||||
target_gain_db, last_gain_db_,
|
||||
/*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0);
|
||||
|
||||
apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db",
|
||||
target_gain_db - last_gain_db_);
|
||||
|
||||
@ -35,6 +35,10 @@ class AdaptiveDigitalGainApplier {
|
||||
};
|
||||
|
||||
explicit AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper);
|
||||
// Ctor. `adjacent_speech_frames_threshold` indicates how many speech frames
|
||||
// are required before a gain increase is allowed.
|
||||
AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper,
|
||||
int adjacent_speech_frames_threshold);
|
||||
AdaptiveDigitalGainApplier(const AdaptiveDigitalGainApplier&) = delete;
|
||||
AdaptiveDigitalGainApplier& operator=(const AdaptiveDigitalGainApplier&) =
|
||||
delete;
|
||||
@ -46,8 +50,10 @@ class AdaptiveDigitalGainApplier {
|
||||
ApmDataDumper* const apm_data_dumper_;
|
||||
GainApplier gain_applier_;
|
||||
|
||||
const int adjacent_speech_frames_threshold_;
|
||||
|
||||
int calls_since_last_gain_log_;
|
||||
bool gain_increase_allowed_;
|
||||
int frames_to_gain_increase_allowed_;
|
||||
float last_gain_db_;
|
||||
};
|
||||
|
||||
|
||||
@ -21,6 +21,10 @@
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
constexpr int kMono = 1;
|
||||
constexpr int kStereo = 2;
|
||||
constexpr int kFrameLen10ms48kHz = 480;
|
||||
|
||||
// Constants used in place of estimated noise levels.
|
||||
constexpr float kNoNoiseDbfs = -90.f;
|
||||
constexpr float kWithNoiseDbfs = -20.f;
|
||||
@ -36,7 +40,7 @@ float RunOnConstantLevel(int num_iterations,
|
||||
float gain_linear = 0.f;
|
||||
|
||||
for (int i = 0; i < num_iterations; ++i) {
|
||||
VectorFloatFrame fake_audio(1, 1, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, 1, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info;
|
||||
info.input_level_dbfs = input_level_dbfs;
|
||||
info.input_noise_level_dbfs = kNoNoiseDbfs;
|
||||
@ -62,7 +66,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
|
||||
|
||||
// Make one call with reasonable audio level values and settings.
|
||||
VectorFloatFrame fake_audio(2, 480, 10000.f);
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -5.0;
|
||||
gain_applier.Process(kFrameInfo, fake_audio.float_frame_view());
|
||||
@ -114,7 +118,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
float last_gain_linear = 1.f;
|
||||
for (int i = 0; i < kNumFramesToAdapt; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame fake_audio(1, 1, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, 1, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
@ -127,7 +131,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
// Check that the same is true when gain decreases as well.
|
||||
for (int i = 0; i < kNumFramesToAdapt; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame fake_audio(1, 1, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, 1, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = 0.f;
|
||||
gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
@ -143,9 +147,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr int num_samples = 480;
|
||||
|
||||
VectorFloatFrame fake_audio(1, num_samples, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
@ -158,7 +161,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
|
||||
}
|
||||
|
||||
const float kMaxChangePerFrameLinear = DbToRatio(kMaxGainChangePerFrameDb);
|
||||
const float kMaxChangePerSample = kMaxChangePerFrameLinear / num_samples;
|
||||
const float kMaxChangePerSample =
|
||||
kMaxChangePerFrameLinear / kFrameLen10ms48kHz;
|
||||
|
||||
EXPECT_LE(maximal_difference, kMaxChangePerSample);
|
||||
}
|
||||
@ -168,7 +172,6 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr int num_samples = 480;
|
||||
constexpr int num_initial_frames =
|
||||
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
|
||||
constexpr int num_frames = 50;
|
||||
@ -176,7 +179,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low";
|
||||
|
||||
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
|
||||
VectorFloatFrame fake_audio(1, num_samples, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
info.input_noise_level_dbfs = kWithNoiseDbfs;
|
||||
@ -198,7 +201,7 @@ TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) {
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
|
||||
|
||||
// Make one call with positive audio level values and settings.
|
||||
VectorFloatFrame fake_audio(2, 480, 10000.f);
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = 5.f;
|
||||
gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
@ -209,7 +212,6 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper);
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr int num_samples = 480;
|
||||
constexpr int num_initial_frames =
|
||||
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
|
||||
constexpr int num_frames = 50;
|
||||
@ -217,7 +219,7 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low";
|
||||
|
||||
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
|
||||
VectorFloatFrame fake_audio(1, num_samples, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
info.limiter_envelope_dbfs = 1.f;
|
||||
@ -235,5 +237,59 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
}
|
||||
}
|
||||
|
||||
// Value-parameterized test fixture whose `int` parameter is used as the
// adjacent speech frames threshold.
class AdaptiveDigitalGainApplierTest : public ::testing::TestWithParam<int> {
|
||||
protected:
|
||||
// Returns the test parameter.
int AdjacentSpeechFramesThreshold() const { return GetParam(); }
|
||||
};
|
||||
|
||||
// Processes exactly `adjacent_speech_frames_threshold` frames through a gain
// applier built with that threshold, and checks that the first output sample
// of channel 0 is the same for every frame.
// NOTE(review): presumably `kFrameInfo` describes a frame classified as speech
// - confirm against its definition, which is outside this view.
TEST_P(AdaptiveDigitalGainApplierTest,
|
||||
DoNotIncreaseGainWithTooFewSpeechFrames) {
|
||||
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper,
|
||||
adjacent_speech_frames_threshold);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -25.0;
|
||||
|
||||
float prev_gain = 0.f;
|
||||
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
// A fresh frame is built on every iteration.
// NOTE(review): presumably all samples are 1.f, so each output sample equals
// the gain applied to it - confirm against VectorFloatFrame.
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
gain_applier.Process(info, audio.float_frame_view());
|
||||
// Only the first sample of the frame is inspected.
// NOTE(review): presumably the gain is ramped within a frame, so the first
// sample reflects the gain reached by the end of the previous frame -
// confirm against GainApplier.
const float gain = audio.float_frame_view().channel(0)[0];
|
||||
// The first iteration has no previous value to compare against.
if (i > 0) {
|
||||
EXPECT_EQ(prev_gain, gain); // No gain increase.
|
||||
}
|
||||
prev_gain = gain;
|
||||
}
|
||||
}
|
||||
|
||||
// Processes `adjacent_speech_frames_threshold` frames, then one more, and
// checks that the first output sample of channel 0 of the extra frame is
// greater than that of the last frame processed in the loop.
// NOTE(review): presumably `kFrameInfo` describes a frame classified as speech
// - confirm against its definition, which is outside this view.
TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) {
|
||||
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper,
|
||||
adjacent_speech_frames_threshold);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -25.0;
|
||||
|
||||
float prev_gain = 0.f;
|
||||
// After this loop `prev_gain` holds the first output sample of the last of
// the `adjacent_speech_frames_threshold` processed frames.
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
gain_applier.Process(info, audio.float_frame_view());
|
||||
prev_gain = audio.float_frame_view().channel(0)[0];
|
||||
}
|
||||
|
||||
// Process one more speech frame.
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
gain_applier.Process(info, audio.float_frame_view());
|
||||
|
||||
// The gain has increased.
|
||||
EXPECT_GT(audio.float_frame_view().channel(0)[0], prev_gain);
|
||||
}
|
||||
|
||||
// Instantiates the AdaptiveDigitalGainApplierTest cases with the parameter
// values 1, 7 and 31, which the fixture uses as the adjacent speech frames
// threshold.
INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
|
||||
AdaptiveDigitalGainApplierTest,
|
||||
::testing::Values(1, 7, 31));
|
||||
|
||||
} // namespace
|
||||
} // namespace webrtc
|
||||
|
||||
@ -64,7 +64,7 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
|
||||
: AdaptiveModeLevelEstimator(
|
||||
apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
|
||||
kDefaultAdjacentSpeechFramesThreshold,
|
||||
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
|
||||
kDefaultInitialSaturationMarginDb,
|
||||
kDefaultExtraSaturationMarginDb) {}
|
||||
|
||||
@ -73,11 +73,12 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
|
||||
AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
|
||||
bool use_saturation_protector,
|
||||
float extra_saturation_margin_db)
|
||||
: AdaptiveModeLevelEstimator(apm_data_dumper,
|
||||
level_estimator,
|
||||
kDefaultAdjacentSpeechFramesThreshold,
|
||||
kDefaultInitialSaturationMarginDb,
|
||||
extra_saturation_margin_db) {
|
||||
: AdaptiveModeLevelEstimator(
|
||||
apm_data_dumper,
|
||||
level_estimator,
|
||||
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
|
||||
kDefaultInitialSaturationMarginDb,
|
||||
extra_saturation_margin_db) {
|
||||
if (!use_saturation_protector) {
|
||||
RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled.";
|
||||
}
|
||||
|
||||
@ -51,7 +51,8 @@ constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
|
||||
|
||||
// Robust VAD probability and speech decisions.
|
||||
constexpr float kDefaultSmoothedVadProbabilityAttack = 1.f;
|
||||
constexpr int kDefaultAdjacentSpeechFramesThreshold = 1;
|
||||
constexpr int kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold = 1;
|
||||
constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1;
|
||||
|
||||
// Saturation Protector settings.
|
||||
constexpr float kDefaultInitialSaturationMarginDb = 20.f;
|
||||
|
||||
Reference in New Issue
Block a user