AGC2: retuning and large refactoring
- Bug fix: the desired initial gain quickly dropped to 0 dB, hence starting a call with a too low level
- New tuning to make AGC2 more robust to VAD mistakes
- Smarter max gain increase speed: because the threshold of adjacent speech frames has been increased, the gain applier temporarily allows a faster gain increase to compensate for the longer time spent waiting for enough speech frames in a row to be observed
- Saturation protector isolated from `AdaptiveModeLevelEstimator` to simplify the unit tests for the latter (non bit-exact change)
- AGC2 adaptive digital config: unnecessary params deprecated
- Code readability improvements
- Data dumps clean-up and better naming

Bug: webrtc:7494
Change-Id: I4e36059bdf2566cc2a7e1a7e95b7430ba9ae9844
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/215140
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Jesus de Vicente Pena <devicentepena@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33736}
This commit is contained in:
committed by
Commit Bot
parent
d28434bd3f
commit
980c4601e1
@ -25,6 +25,8 @@ rtc_library("adaptive_digital") {
|
||||
"adaptive_mode_level_estimator.h",
|
||||
"saturation_protector.cc",
|
||||
"saturation_protector.h",
|
||||
"saturation_protector_buffer.cc",
|
||||
"saturation_protector_buffer.h",
|
||||
]
|
||||
|
||||
configs += [ "..:apm_debug_dump" ]
|
||||
@ -177,6 +179,7 @@ rtc_library("adaptive_digital_unittests") {
|
||||
"adaptive_digital_gain_applier_unittest.cc",
|
||||
"adaptive_mode_level_estimator_unittest.cc",
|
||||
"gain_applier_unittest.cc",
|
||||
"saturation_protector_buffer_unittest.cc",
|
||||
"saturation_protector_unittest.cc",
|
||||
]
|
||||
deps = [
|
||||
|
||||
@ -25,15 +25,6 @@ using AdaptiveDigitalConfig =
|
||||
using NoiseEstimatorType =
|
||||
AudioProcessing::Config::GainController2::NoiseEstimator;
|
||||
|
||||
void DumpDebugData(const AdaptiveDigitalGainApplier::FrameInfo& info,
|
||||
ApmDataDumper& dumper) {
|
||||
dumper.DumpRaw("agc2_vad_probability", info.vad_result.speech_probability);
|
||||
dumper.DumpRaw("agc2_vad_rms_dbfs", info.vad_result.rms_dbfs);
|
||||
dumper.DumpRaw("agc2_vad_peak_dbfs", info.vad_result.peak_dbfs);
|
||||
dumper.DumpRaw("agc2_noise_estimate_dbfs", info.input_noise_level_dbfs);
|
||||
dumper.DumpRaw("agc2_last_limiter_audio_level", info.limiter_envelope_dbfs);
|
||||
}
|
||||
|
||||
constexpr int kGainApplierAdjacentSpeechFramesThreshold = 1;
|
||||
constexpr float kMaxGainChangePerSecondDb = 3.0f;
|
||||
constexpr float kMaxOutputNoiseLevelDbfs = -50.0f;
|
||||
@ -72,36 +63,42 @@ constexpr NoiseEstimatorType kDefaultNoiseLevelEstimatorType =
|
||||
|
||||
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper)
|
||||
: speech_level_estimator_(apm_data_dumper),
|
||||
gain_applier_(apm_data_dumper,
|
||||
kGainApplierAdjacentSpeechFramesThreshold,
|
||||
kMaxGainChangePerSecondDb,
|
||||
kMaxOutputNoiseLevelDbfs),
|
||||
gain_controller_(apm_data_dumper,
|
||||
kGainApplierAdjacentSpeechFramesThreshold,
|
||||
kMaxGainChangePerSecondDb,
|
||||
kMaxOutputNoiseLevelDbfs),
|
||||
apm_data_dumper_(apm_data_dumper),
|
||||
noise_level_estimator_(
|
||||
CreateNoiseLevelEstimator(kDefaultNoiseLevelEstimatorType,
|
||||
apm_data_dumper)),
|
||||
saturation_protector_(
|
||||
CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb,
|
||||
kSaturationProtectorExtraHeadroomDb,
|
||||
kGainApplierAdjacentSpeechFramesThreshold,
|
||||
apm_data_dumper)) {
|
||||
RTC_DCHECK(apm_data_dumper);
|
||||
}
|
||||
|
||||
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper,
|
||||
const AdaptiveDigitalConfig& config)
|
||||
: speech_level_estimator_(
|
||||
apm_data_dumper,
|
||||
config.level_estimator,
|
||||
config.level_estimator_adjacent_speech_frames_threshold,
|
||||
config.initial_saturation_margin_db,
|
||||
config.extra_saturation_margin_db),
|
||||
vad_(config.vad_reset_period_ms,
|
||||
config.vad_probability_attack,
|
||||
GetAllowedCpuFeatures(config)),
|
||||
gain_applier_(apm_data_dumper,
|
||||
config.gain_applier_adjacent_speech_frames_threshold,
|
||||
config.max_gain_change_db_per_second,
|
||||
config.max_output_noise_level_dbfs),
|
||||
: speech_level_estimator_(apm_data_dumper,
|
||||
config.adjacent_speech_frames_threshold),
|
||||
vad_(config.vad_reset_period_ms, GetAllowedCpuFeatures(config)),
|
||||
gain_controller_(apm_data_dumper,
|
||||
config.adjacent_speech_frames_threshold,
|
||||
config.max_gain_change_db_per_second,
|
||||
config.max_output_noise_level_dbfs),
|
||||
apm_data_dumper_(apm_data_dumper),
|
||||
noise_level_estimator_(
|
||||
CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)) {
|
||||
CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)),
|
||||
saturation_protector_(
|
||||
CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb,
|
||||
kSaturationProtectorExtraHeadroomDb,
|
||||
config.adjacent_speech_frames_threshold,
|
||||
apm_data_dumper)) {
|
||||
RTC_DCHECK(apm_data_dumper);
|
||||
RTC_DCHECK(noise_level_estimator_);
|
||||
RTC_DCHECK(saturation_protector_);
|
||||
if (!config.use_saturation_protector) {
|
||||
RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled.";
|
||||
}
|
||||
@ -111,19 +108,39 @@ AdaptiveAgc::~AdaptiveAgc() = default;
|
||||
|
||||
void AdaptiveAgc::Process(AudioFrameView<float> frame, float limiter_envelope) {
|
||||
AdaptiveDigitalGainApplier::FrameInfo info;
|
||||
info.vad_result = vad_.AnalyzeFrame(frame);
|
||||
speech_level_estimator_.Update(info.vad_result);
|
||||
info.input_level_dbfs = speech_level_estimator_.level_dbfs();
|
||||
info.input_noise_level_dbfs = noise_level_estimator_->Analyze(frame);
|
||||
info.limiter_envelope_dbfs =
|
||||
limiter_envelope > 0 ? FloatS16ToDbfs(limiter_envelope) : -90.0f;
|
||||
info.estimate_is_confident = speech_level_estimator_.IsConfident();
|
||||
DumpDebugData(info, *apm_data_dumper_);
|
||||
gain_applier_.Process(info, frame);
|
||||
|
||||
VadLevelAnalyzer::Result vad_result = vad_.AnalyzeFrame(frame);
|
||||
info.speech_probability = vad_result.speech_probability;
|
||||
apm_data_dumper_->DumpRaw("agc2_speech_probability",
|
||||
vad_result.speech_probability);
|
||||
apm_data_dumper_->DumpRaw("agc2_input_rms_dbfs", vad_result.rms_dbfs);
|
||||
apm_data_dumper_->DumpRaw("agc2_input_peak_dbfs", vad_result.peak_dbfs);
|
||||
|
||||
speech_level_estimator_.Update(vad_result);
|
||||
info.speech_level_dbfs = speech_level_estimator_.level_dbfs();
|
||||
info.speech_level_reliable = speech_level_estimator_.IsConfident();
|
||||
apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", info.speech_level_dbfs);
|
||||
apm_data_dumper_->DumpRaw("agc2_speech_level_reliable",
|
||||
info.speech_level_reliable);
|
||||
|
||||
info.noise_rms_dbfs = noise_level_estimator_->Analyze(frame);
|
||||
apm_data_dumper_->DumpRaw("agc2_noise_rms_dbfs", info.noise_rms_dbfs);
|
||||
|
||||
saturation_protector_->Analyze(info.speech_probability, vad_result.peak_dbfs,
|
||||
info.speech_level_dbfs);
|
||||
info.headroom_db = saturation_protector_->HeadroomDb();
|
||||
apm_data_dumper_->DumpRaw("agc2_headroom_db", info.headroom_db);
|
||||
|
||||
info.limiter_envelope_dbfs = FloatS16ToDbfs(limiter_envelope);
|
||||
apm_data_dumper_->DumpRaw("agc2_limiter_envelope_dbfs",
|
||||
info.limiter_envelope_dbfs);
|
||||
|
||||
gain_controller_.Process(info, frame);
|
||||
}
|
||||
|
||||
void AdaptiveAgc::Reset() {
|
||||
void AdaptiveAgc::HandleInputGainChange() {
|
||||
speech_level_estimator_.Reset();
|
||||
saturation_protector_->Reset();
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
#include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h"
|
||||
#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
|
||||
#include "modules/audio_processing/agc2/noise_level_estimator.h"
|
||||
#include "modules/audio_processing/agc2/saturation_protector.h"
|
||||
#include "modules/audio_processing/agc2/vad_with_level.h"
|
||||
#include "modules/audio_processing/include/audio_frame_view.h"
|
||||
#include "modules/audio_processing/include/audio_processing.h"
|
||||
@ -38,14 +39,17 @@ class AdaptiveAgc {
|
||||
// account the envelope measured by the limiter.
|
||||
// TODO(crbug.com/webrtc/7494): Make the class depend on the limiter.
|
||||
void Process(AudioFrameView<float> frame, float limiter_envelope);
|
||||
void Reset();
|
||||
|
||||
// Handles a gain change applied to the input signal (e.g., analog gain).
|
||||
void HandleInputGainChange();
|
||||
|
||||
private:
|
||||
AdaptiveModeLevelEstimator speech_level_estimator_;
|
||||
VadLevelAnalyzer vad_;
|
||||
AdaptiveDigitalGainApplier gain_applier_;
|
||||
AdaptiveDigitalGainApplier gain_controller_;
|
||||
ApmDataDumper* const apm_data_dumper_;
|
||||
std::unique_ptr<NoiseLevelEstimator> noise_level_estimator_;
|
||||
std::unique_ptr<SaturationProtector> saturation_protector_;
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -23,6 +23,9 @@
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
constexpr int kHeadroomHistogramMin = 0;
|
||||
constexpr int kHeadroomHistogramMax = 50;
|
||||
|
||||
// This function maps input level to desired applied gain. We want to
|
||||
// boost the signal so that peaks are at -kHeadroomDbfs. We can't
|
||||
// apply more than kMaxGainDb gain.
|
||||
@ -31,17 +34,13 @@ float ComputeGainDb(float input_level_dbfs) {
|
||||
if (input_level_dbfs < -(kHeadroomDbfs + kMaxGainDb)) {
|
||||
return kMaxGainDb;
|
||||
}
|
||||
|
||||
// We expect to end up here most of the time: the level is below
|
||||
// -headroom, but we can boost it to -headroom.
|
||||
if (input_level_dbfs < -kHeadroomDbfs) {
|
||||
return -kHeadroomDbfs - input_level_dbfs;
|
||||
}
|
||||
|
||||
// Otherwise, the level is too high and we can't boost. The
|
||||
// LevelEstimator is responsible for not reporting bogus gain
|
||||
// values.
|
||||
RTC_DCHECK_LE(input_level_dbfs, 0.f);
|
||||
// Otherwise, the level is too high and we can't boost.
|
||||
RTC_DCHECK_GE(input_level_dbfs, -kHeadroomDbfs);
|
||||
return 0.f;
|
||||
}
|
||||
|
||||
@ -52,10 +51,11 @@ float LimitGainByNoise(float target_gain,
|
||||
float input_noise_level_dbfs,
|
||||
float max_output_noise_level_dbfs,
|
||||
ApmDataDumper& apm_data_dumper) {
|
||||
const float noise_headroom_db =
|
||||
const float max_allowed_gain_db =
|
||||
max_output_noise_level_dbfs - input_noise_level_dbfs;
|
||||
apm_data_dumper.DumpRaw("agc2_noise_headroom_db", noise_headroom_db);
|
||||
return std::min(target_gain, std::max(noise_headroom_db, 0.f));
|
||||
apm_data_dumper.DumpRaw("agc2_adaptive_gain_applier_max_allowed_gain_db",
|
||||
max_allowed_gain_db);
|
||||
return std::min(target_gain, std::max(max_allowed_gain_db, 0.f));
|
||||
}
|
||||
|
||||
float LimitGainByLowConfidence(float target_gain,
|
||||
@ -68,8 +68,8 @@ float LimitGainByLowConfidence(float target_gain,
|
||||
}
|
||||
const float limiter_level_before_gain = limiter_audio_level_dbfs - last_gain;
|
||||
|
||||
// Compute a new gain so that limiter_level_before_gain + new_gain <=
|
||||
// kLimiterThreshold.
|
||||
// Compute a new gain so that `limiter_level_before_gain` + `new_target_gain`
|
||||
// is not greater than `kLimiterThresholdForAgcGainDbfs`.
|
||||
const float new_target_gain = std::max(
|
||||
kLimiterThresholdForAgcGainDbfs - limiter_level_before_gain, 0.f);
|
||||
return std::min(new_target_gain, target_gain);
|
||||
@ -80,13 +80,16 @@ float LimitGainByLowConfidence(float target_gain,
|
||||
float ComputeGainChangeThisFrameDb(float target_gain_db,
|
||||
float last_gain_db,
|
||||
bool gain_increase_allowed,
|
||||
float max_gain_change_db) {
|
||||
float max_gain_decrease_db,
|
||||
float max_gain_increase_db) {
|
||||
RTC_DCHECK_GT(max_gain_decrease_db, 0);
|
||||
RTC_DCHECK_GT(max_gain_increase_db, 0);
|
||||
float target_gain_difference_db = target_gain_db - last_gain_db;
|
||||
if (!gain_increase_allowed) {
|
||||
target_gain_difference_db = std::min(target_gain_difference_db, 0.f);
|
||||
}
|
||||
return rtc::SafeClamp(target_gain_difference_db, -max_gain_change_db,
|
||||
max_gain_change_db);
|
||||
return rtc::SafeClamp(target_gain_difference_db, -max_gain_decrease_db,
|
||||
max_gain_increase_db);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -115,7 +118,7 @@ AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier(
|
||||
|
||||
void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
|
||||
AudioFrameView<float> frame) {
|
||||
RTC_DCHECK_GE(info.input_level_dbfs, -150.f);
|
||||
RTC_DCHECK_GE(info.speech_level_dbfs, -150.f);
|
||||
RTC_DCHECK_GE(frame.num_channels(), 1);
|
||||
RTC_DCHECK(
|
||||
frame.samples_per_channel() == 80 || frame.samples_per_channel() == 160 ||
|
||||
@ -123,30 +126,46 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
|
||||
<< "`frame` does not look like a 10 ms frame for an APM supported sample "
|
||||
"rate";
|
||||
|
||||
// Compute the input level used to select the desired gain.
|
||||
RTC_DCHECK_GT(info.headroom_db, 0.0f);
|
||||
const float input_level_dbfs = info.speech_level_dbfs + info.headroom_db;
|
||||
|
||||
const float target_gain_db = LimitGainByLowConfidence(
|
||||
LimitGainByNoise(ComputeGainDb(std::min(info.input_level_dbfs, 0.f)),
|
||||
info.input_noise_level_dbfs,
|
||||
LimitGainByNoise(ComputeGainDb(input_level_dbfs), info.noise_rms_dbfs,
|
||||
max_output_noise_level_dbfs_, *apm_data_dumper_),
|
||||
last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident);
|
||||
last_gain_db_, info.limiter_envelope_dbfs, info.speech_level_reliable);
|
||||
|
||||
// Forbid increasing the gain until enough adjacent speech frames are
|
||||
// observed.
|
||||
if (info.vad_result.speech_probability < kVadConfidenceThreshold) {
|
||||
bool first_confident_speech_frame = false;
|
||||
if (info.speech_probability < kVadConfidenceThreshold) {
|
||||
frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_;
|
||||
} else if (frames_to_gain_increase_allowed_ > 0) {
|
||||
frames_to_gain_increase_allowed_--;
|
||||
first_confident_speech_frame = frames_to_gain_increase_allowed_ == 0;
|
||||
}
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_gain_applier_frames_to_gain_increase_allowed",
|
||||
frames_to_gain_increase_allowed_);
|
||||
|
||||
const bool gain_increase_allowed = frames_to_gain_increase_allowed_ == 0;
|
||||
|
||||
float max_gain_increase_db = max_gain_change_db_per_10ms_;
|
||||
if (first_confident_speech_frame) {
|
||||
// No gain increase happened while waiting for a long enough speech
|
||||
// sequence. Therefore, temporarily allow a faster gain increase.
|
||||
RTC_DCHECK(gain_increase_allowed);
|
||||
max_gain_increase_db *= adjacent_speech_frames_threshold_;
|
||||
}
|
||||
apm_data_dumper_->DumpRaw("agc2_frames_to_gain_increase_allowed",
|
||||
frames_to_gain_increase_allowed_);
|
||||
|
||||
const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb(
|
||||
target_gain_db, last_gain_db_,
|
||||
/*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0,
|
||||
max_gain_change_db_per_10ms_);
|
||||
target_gain_db, last_gain_db_, gain_increase_allowed,
|
||||
/*max_gain_decrease_db=*/max_gain_change_db_per_10ms_,
|
||||
max_gain_increase_db);
|
||||
|
||||
apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db",
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_want_to_change_by_db",
|
||||
target_gain_db - last_gain_db_);
|
||||
apm_data_dumper_->DumpRaw("agc2_will_change_by_db",
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_will_change_by_db",
|
||||
gain_change_this_frame_db);
|
||||
|
||||
// Optimization: avoid calling math functions if gain does not
|
||||
@ -159,23 +178,29 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
|
||||
|
||||
// Remember that the gain has changed for the next iteration.
|
||||
last_gain_db_ = last_gain_db_ + gain_change_this_frame_db;
|
||||
apm_data_dumper_->DumpRaw("agc2_applied_gain_db", last_gain_db_);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_applied_gain_db",
|
||||
last_gain_db_);
|
||||
|
||||
// Log every 10 seconds.
|
||||
calls_since_last_gain_log_++;
|
||||
if (calls_since_last_gain_log_ == 1000) {
|
||||
calls_since_last_gain_log_ = 0;
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedSpeechLevel",
|
||||
-info.speech_level_dbfs, 0, 100, 101);
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel",
|
||||
-info.noise_rms_dbfs, 0, 100, 101);
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR(
|
||||
"WebRTC.Audio.Agc2.Headroom", info.headroom_db, kHeadroomHistogramMin,
|
||||
kHeadroomHistogramMax,
|
||||
kHeadroomHistogramMax - kHeadroomHistogramMin + 1);
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.DigitalGainApplied",
|
||||
last_gain_db_, 0, kMaxGainDb, kMaxGainDb + 1);
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR(
|
||||
"WebRTC.Audio.Agc2.EstimatedSpeechPlusNoiseLevel",
|
||||
-info.input_level_dbfs, 0, 100, 101);
|
||||
RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel",
|
||||
-info.input_noise_level_dbfs, 0, 100, 101);
|
||||
RTC_LOG(LS_INFO) << "AGC2 adaptive digital"
|
||||
<< " | speech_plus_noise_dbfs: " << info.input_level_dbfs
|
||||
<< " | noise_dbfs: " << info.input_noise_level_dbfs
|
||||
<< " | speech_dbfs: " << info.speech_level_dbfs
|
||||
<< " | noise_dbfs: " << info.noise_rms_dbfs
|
||||
<< " | headroom_db: " << info.headroom_db
|
||||
<< " | gain_db: " << last_gain_db_;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -12,33 +12,32 @@
|
||||
#define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_DIGITAL_GAIN_APPLIER_H_
|
||||
|
||||
#include "modules/audio_processing/agc2/gain_applier.h"
|
||||
#include "modules/audio_processing/agc2/vad_with_level.h"
|
||||
#include "modules/audio_processing/include/audio_frame_view.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
class ApmDataDumper;
|
||||
|
||||
// Part of the adaptive digital controller that applies a digital adaptive gain.
|
||||
// The gain is updated towards a target. The logic decides when gain updates are
|
||||
// allowed, it controls the adaptation speed and caps the target based on the
|
||||
// estimated noise level and the speech level estimate confidence.
|
||||
// TODO(bugs.webrtc.org): Split into `GainAdaptor` and `GainApplier`.
|
||||
// Selects the target digital gain, decides when and how quickly to adapt to the
|
||||
// target and applies the current gain to 10 ms frames.
|
||||
class AdaptiveDigitalGainApplier {
|
||||
public:
|
||||
// Information about a frame to process.
|
||||
struct FrameInfo {
|
||||
float input_level_dbfs; // Estimated speech plus noise level.
|
||||
float input_noise_level_dbfs; // Estimated noise level.
|
||||
VadLevelAnalyzer::Result vad_result;
|
||||
float limiter_envelope_dbfs; // Envelope level from the limiter.
|
||||
bool estimate_is_confident;
|
||||
float speech_probability; // Probability of speech in the [0, 1] range.
|
||||
float speech_level_dbfs; // Estimated speech level (dBFS).
|
||||
bool speech_level_reliable; // True with reliable speech level estimation.
|
||||
float noise_rms_dbfs; // Estimated noise RMS level (dBFS).
|
||||
float headroom_db; // Headroom (dB).
|
||||
float limiter_envelope_dbfs; // Envelope level from the limiter (dBFS).
|
||||
};
|
||||
|
||||
// Ctor.
|
||||
// `adjacent_speech_frames_threshold` indicates how many speech frames are
|
||||
// required before a gain increase is allowed. `max_gain_change_db_per_second`
|
||||
// limits the adaptation speed (uniformly operated across frames).
|
||||
// `max_output_noise_level_dbfs` limits the output noise level.
|
||||
// Ctor. `adjacent_speech_frames_threshold` indicates how many adjacent speech
|
||||
// frames must be observed in order to consider the sequence as speech.
|
||||
// `max_gain_change_db_per_second` limits the adaptation speed (uniformly
|
||||
// operated across frames). `max_output_noise_level_dbfs` limits the output
|
||||
// noise level.
|
||||
AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper,
|
||||
int adjacent_speech_frames_threshold,
|
||||
float max_gain_change_db_per_second,
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
#include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "common_audio/include/audio_util.h"
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
@ -26,104 +27,75 @@ constexpr int kStereo = 2;
|
||||
constexpr int kFrameLen10ms8kHz = 80;
|
||||
constexpr int kFrameLen10ms48kHz = 480;
|
||||
|
||||
constexpr float kMaxSpeechProbability = 1.0f;
|
||||
|
||||
// Constants used in place of estimated noise levels.
|
||||
constexpr float kNoNoiseDbfs = -90.f;
|
||||
constexpr float kNoNoiseDbfs = kMinLevelDbfs;
|
||||
constexpr float kWithNoiseDbfs = -20.f;
|
||||
static_assert(std::is_trivially_destructible<VadLevelAnalyzer::Result>::value,
|
||||
"");
|
||||
constexpr VadLevelAnalyzer::Result kVadSpeech{1.f, -20.f, 0.f};
|
||||
|
||||
constexpr float kMaxGainChangePerSecondDb = 3.f;
|
||||
constexpr float kMaxGainChangePerSecondDb = 3.0f;
|
||||
constexpr float kMaxGainChangePerFrameDb =
|
||||
kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.f;
|
||||
constexpr float kMaxOutputNoiseLevelDbfs = -50.f;
|
||||
kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.0f;
|
||||
constexpr float kMaxOutputNoiseLevelDbfs = -50.0f;
|
||||
|
||||
// Helper to instance `AdaptiveDigitalGainApplier`.
|
||||
// Helper to create initialized `AdaptiveDigitalGainApplier` objects.
|
||||
struct GainApplierHelper {
|
||||
GainApplierHelper()
|
||||
: GainApplierHelper(/*adjacent_speech_frames_threshold=*/1) {}
|
||||
explicit GainApplierHelper(int adjacent_speech_frames_threshold)
|
||||
: apm_data_dumper(0),
|
||||
gain_applier(&apm_data_dumper,
|
||||
adjacent_speech_frames_threshold,
|
||||
kMaxGainChangePerSecondDb,
|
||||
kMaxOutputNoiseLevelDbfs) {}
|
||||
gain_applier(std::make_unique<AdaptiveDigitalGainApplier>(
|
||||
&apm_data_dumper,
|
||||
adjacent_speech_frames_threshold,
|
||||
kMaxGainChangePerSecondDb,
|
||||
kMaxOutputNoiseLevelDbfs)) {}
|
||||
ApmDataDumper apm_data_dumper;
|
||||
AdaptiveDigitalGainApplier gain_applier;
|
||||
std::unique_ptr<AdaptiveDigitalGainApplier> gain_applier;
|
||||
};
|
||||
|
||||
// Runs gain applier and returns the applied gain in linear scale.
|
||||
float RunOnConstantLevel(int num_iterations,
|
||||
VadLevelAnalyzer::Result vad_level,
|
||||
float input_level_dbfs,
|
||||
AdaptiveDigitalGainApplier* gain_applier) {
|
||||
float gain_linear = 0.f;
|
||||
|
||||
for (int i = 0; i < num_iterations; ++i) {
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info;
|
||||
info.input_level_dbfs = input_level_dbfs;
|
||||
info.input_noise_level_dbfs = kNoNoiseDbfs;
|
||||
info.vad_result = vad_level;
|
||||
info.limiter_envelope_dbfs = -2.f;
|
||||
info.estimate_is_confident = true;
|
||||
gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
gain_linear = fake_audio.float_frame_view().channel(0)[0];
|
||||
}
|
||||
return gain_linear;
|
||||
}
|
||||
|
||||
// Voice on, no noise, low limiter, confident level.
|
||||
static_assert(std::is_trivially_destructible<
|
||||
AdaptiveDigitalGainApplier::FrameInfo>::value,
|
||||
"");
|
||||
constexpr AdaptiveDigitalGainApplier::FrameInfo kFrameInfo{
|
||||
/*input_level_dbfs=*/-1.f,
|
||||
/*input_noise_level_dbfs=*/kNoNoiseDbfs,
|
||||
/*vad_result=*/kVadSpeech,
|
||||
/*limiter_envelope_dbfs=*/-2.f,
|
||||
/*estimate_is_confident=*/true};
|
||||
/*speech_probability=*/kMaxSpeechProbability,
|
||||
/*speech_level_dbfs=*/kInitialSpeechLevelEstimateDbfs,
|
||||
/*speech_level_reliable=*/true,
|
||||
/*noise_rms_dbfs=*/kNoNoiseDbfs,
|
||||
/*headroom_db=*/kSaturationProtectorInitialHeadroomDb,
|
||||
/*limiter_envelope_dbfs=*/-2.0f};
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
|
||||
TEST(GainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
|
||||
GainApplierHelper helper;
|
||||
// Make one call with reasonable audio level values and settings.
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -5.0;
|
||||
helper.gain_applier.Process(kFrameInfo, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = -5.0f;
|
||||
helper.gain_applier->Process(kFrameInfo, fake_audio.float_frame_view());
|
||||
}
|
||||
|
||||
// Check that the output is -kHeadroom dBFS.
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, TargetLevelIsReached) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -5.f;
|
||||
|
||||
const float applied_gain = RunOnConstantLevel(
|
||||
200, kVadSpeech, initial_level_dbfs, &helper.gain_applier);
|
||||
|
||||
EXPECT_NEAR(applied_gain, DbToRatio(-kHeadroomDbfs - initial_level_dbfs),
|
||||
0.1f);
|
||||
}
|
||||
|
||||
// Check that the output is -kHeadroom dBFS
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, GainApproachesMaxGain) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -kHeadroomDbfs - kMaxGainDb - 10.f;
|
||||
// A few extra frames for safety.
|
||||
// Checks that the maximum allowed gain is applied.
|
||||
TEST(GainController2AdaptiveGainApplier, MaxGainApplied) {
|
||||
constexpr int kNumFramesToAdapt =
|
||||
static_cast<int>(kMaxGainDb / kMaxGainChangePerFrameDb) + 10;
|
||||
|
||||
const float applied_gain = RunOnConstantLevel(
|
||||
kNumFramesToAdapt, kVadSpeech, initial_level_dbfs, &helper.gain_applier);
|
||||
EXPECT_NEAR(applied_gain, DbToRatio(kMaxGainDb), 0.1f);
|
||||
|
||||
const float applied_gain_db = 20.f * std::log10(applied_gain);
|
||||
GainApplierHelper helper;
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.speech_level_dbfs = -60.0f;
|
||||
float applied_gain;
|
||||
for (int i = 0; i < kNumFramesToAdapt; ++i) {
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
applied_gain = fake_audio.float_frame_view().channel(0)[0];
|
||||
}
|
||||
const float applied_gain_db = 20.0f * std::log10f(applied_gain);
|
||||
EXPECT_NEAR(applied_gain_db, kMaxGainDb, 0.1f);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
TEST(GainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr float initial_level_dbfs = -25.0f;
|
||||
// A few extra frames for safety.
|
||||
constexpr int kNumFramesToAdapt =
|
||||
static_cast<int>(initial_level_dbfs / kMaxGainChangePerFrameDb) + 10;
|
||||
@ -133,10 +105,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
float last_gain_linear = 1.f;
|
||||
for (int i = 0; i < kNumFramesToAdapt; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = initial_level_dbfs;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
float current_gain_linear = fake_audio.float_frame_view().channel(0)[0];
|
||||
EXPECT_LE(std::abs(current_gain_linear - last_gain_linear),
|
||||
kMaxChangePerFrameLinear);
|
||||
@ -146,10 +118,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
// Check that the same is true when gain decreases as well.
|
||||
for (int i = 0; i < kNumFramesToAdapt; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = 0.f;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = 0.f;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
float current_gain_linear = fake_audio.float_frame_view().channel(0)[0];
|
||||
EXPECT_LE(std::abs(current_gain_linear - last_gain_linear),
|
||||
kMaxChangePerFrameLinear);
|
||||
@ -157,17 +129,17 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
|
||||
TEST(GainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr float initial_level_dbfs = -25.0f;
|
||||
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
float maximal_difference = 0.f;
|
||||
float current_value = 1.f * DbToRatio(kInitialAdaptiveDigitalGainDb);
|
||||
info.speech_level_dbfs = initial_level_dbfs;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
float maximal_difference = 0.0f;
|
||||
float current_value = 1.0f * DbToRatio(kInitialAdaptiveDigitalGainDb);
|
||||
for (const auto& x : fake_audio.float_frame_view().channel(0)) {
|
||||
const float difference = std::abs(x - current_value);
|
||||
maximal_difference = std::max(maximal_difference, difference);
|
||||
@ -181,10 +153,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
|
||||
EXPECT_LE(maximal_difference, kMaxChangePerSample);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
TEST(GainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr float initial_level_dbfs = -25.0f;
|
||||
constexpr int num_initial_frames =
|
||||
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
|
||||
constexpr int num_frames = 50;
|
||||
@ -193,11 +165,11 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
<< "kWithNoiseDbfs is too low";
|
||||
|
||||
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
info.input_noise_level_dbfs = kWithNoiseDbfs;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = initial_level_dbfs;
|
||||
info.noise_rms_dbfs = kWithNoiseDbfs;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
|
||||
// Wait so that the adaptive gain applier has time to lower the gain.
|
||||
if (i > num_initial_frames) {
|
||||
@ -205,25 +177,25 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
|
||||
*std::max_element(fake_audio.float_frame_view().channel(0).begin(),
|
||||
fake_audio.float_frame_view().channel(0).end());
|
||||
|
||||
EXPECT_NEAR(maximal_ratio, 1.f, 0.001f);
|
||||
EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) {
|
||||
TEST(GainController2GainApplier, CanHandlePositiveSpeechLevels) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
// Make one call with positive audio level values and settings.
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
|
||||
VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = 5.f;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = 5.0f;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
TEST(GainController2GainApplier, AudioLevelLimitsGain) {
|
||||
GainApplierHelper helper;
|
||||
|
||||
constexpr float initial_level_dbfs = -25.f;
|
||||
constexpr float initial_level_dbfs = -25.0f;
|
||||
constexpr int num_initial_frames =
|
||||
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
|
||||
constexpr int num_frames = 50;
|
||||
@ -232,12 +204,12 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
<< "kWithNoiseDbfs is too low";
|
||||
|
||||
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = initial_level_dbfs;
|
||||
info.limiter_envelope_dbfs = 1.f;
|
||||
info.estimate_is_confident = false;
|
||||
helper.gain_applier.Process(info, fake_audio.float_frame_view());
|
||||
info.speech_level_dbfs = initial_level_dbfs;
|
||||
info.limiter_envelope_dbfs = 1.0f;
|
||||
info.speech_level_reliable = false;
|
||||
helper.gain_applier->Process(info, fake_audio.float_frame_view());
|
||||
|
||||
// Wait so that the adaptive gain applier has time to lower the gain.
|
||||
if (i > num_initial_frames) {
|
||||
@ -245,7 +217,7 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
|
||||
*std::max_element(fake_audio.float_frame_view().channel(0).begin(),
|
||||
fake_audio.float_frame_view().channel(0).end());
|
||||
|
||||
EXPECT_NEAR(maximal_ratio, 1.f, 0.001f);
|
||||
EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -260,14 +232,11 @@ TEST_P(AdaptiveDigitalGainApplierTest,
|
||||
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
|
||||
GainApplierHelper helper(adjacent_speech_frames_threshold);
|
||||
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -25.0;
|
||||
|
||||
float prev_gain = 0.f;
|
||||
float prev_gain = 0.0f;
|
||||
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
helper.gain_applier.Process(info, audio.float_frame_view());
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
|
||||
const float gain = audio.float_frame_view().channel(0)[0];
|
||||
if (i > 0) {
|
||||
EXPECT_EQ(prev_gain, gain); // No gain increase.
|
||||
@ -280,25 +249,23 @@ TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) {
|
||||
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
|
||||
GainApplierHelper helper(adjacent_speech_frames_threshold);
|
||||
|
||||
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
|
||||
info.input_level_dbfs = -25.0;
|
||||
|
||||
float prev_gain = 0.f;
|
||||
float prev_gain = 0.0f;
|
||||
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
helper.gain_applier.Process(info, audio.float_frame_view());
|
||||
SCOPED_TRACE(i);
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
|
||||
prev_gain = audio.float_frame_view().channel(0)[0];
|
||||
}
|
||||
|
||||
// Process one more speech frame.
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
|
||||
helper.gain_applier.Process(info, audio.float_frame_view());
|
||||
VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
|
||||
helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
|
||||
|
||||
// The gain has increased.
|
||||
EXPECT_GT(audio.float_frame_view().channel(0)[0], prev_gain);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
|
||||
INSTANTIATE_TEST_SUITE_P(GainController2,
|
||||
AdaptiveDigitalGainApplierTest,
|
||||
::testing::Values(1, 7, 31));
|
||||
|
||||
|
||||
@ -22,37 +22,17 @@ namespace {
|
||||
using LevelEstimatorType =
|
||||
AudioProcessing::Config::GainController2::LevelEstimator;
|
||||
|
||||
// Combines a level estimation with the saturation protector margins.
|
||||
float ComputeLevelEstimateDbfs(float level_estimate_dbfs,
|
||||
float saturation_margin_db,
|
||||
float extra_saturation_margin_db) {
|
||||
return rtc::SafeClamp<float>(
|
||||
level_estimate_dbfs + saturation_margin_db + extra_saturation_margin_db,
|
||||
-90.f, 30.f);
|
||||
}
|
||||
|
||||
// Returns the level of given type from `vad_level`.
|
||||
float GetLevel(const VadLevelAnalyzer::Result& vad_level,
|
||||
LevelEstimatorType type) {
|
||||
switch (type) {
|
||||
case LevelEstimatorType::kRms:
|
||||
return vad_level.rms_dbfs;
|
||||
break;
|
||||
case LevelEstimatorType::kPeak:
|
||||
return vad_level.peak_dbfs;
|
||||
break;
|
||||
}
|
||||
RTC_CHECK_NOTREACHED();
|
||||
float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
|
||||
return rtc::SafeClamp<float>(level_estimate_dbfs, -90.f, 30.f);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool AdaptiveModeLevelEstimator::LevelEstimatorState::operator==(
|
||||
const AdaptiveModeLevelEstimator::LevelEstimatorState& b) const {
|
||||
return time_to_full_buffer_ms == b.time_to_full_buffer_ms &&
|
||||
return time_to_confidence_ms == b.time_to_confidence_ms &&
|
||||
level_dbfs.numerator == b.level_dbfs.numerator &&
|
||||
level_dbfs.denominator == b.level_dbfs.denominator &&
|
||||
saturation_protector == b.saturation_protector;
|
||||
level_dbfs.denominator == b.level_dbfs.denominator;
|
||||
}
|
||||
|
||||
float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
|
||||
@ -64,25 +44,14 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
|
||||
ApmDataDumper* apm_data_dumper)
|
||||
: AdaptiveModeLevelEstimator(
|
||||
apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
|
||||
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
|
||||
kDefaultInitialSaturationMarginDb,
|
||||
kDefaultExtraSaturationMarginDb) {}
|
||||
kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {}
|
||||
|
||||
AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
|
||||
ApmDataDumper* apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
|
||||
int adjacent_speech_frames_threshold,
|
||||
float initial_saturation_margin_db,
|
||||
float extra_saturation_margin_db)
|
||||
int adjacent_speech_frames_threshold)
|
||||
: apm_data_dumper_(apm_data_dumper),
|
||||
level_estimator_type_(level_estimator),
|
||||
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
|
||||
initial_saturation_margin_db_(initial_saturation_margin_db),
|
||||
extra_saturation_margin_db_(extra_saturation_margin_db),
|
||||
level_dbfs_(ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs,
|
||||
initial_saturation_margin_db_,
|
||||
extra_saturation_margin_db_)) {
|
||||
level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) {
|
||||
RTC_DCHECK(apm_data_dumper_);
|
||||
RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
|
||||
Reset();
|
||||
@ -96,8 +65,6 @@ void AdaptiveModeLevelEstimator::Update(
|
||||
RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f);
|
||||
RTC_DCHECK_GE(vad_level.speech_probability, 0.f);
|
||||
RTC_DCHECK_LE(vad_level.speech_probability, 1.f);
|
||||
DumpDebugData();
|
||||
|
||||
if (vad_level.speech_probability < kVadConfidenceThreshold) {
|
||||
// Not a speech frame.
|
||||
if (adjacent_speech_frames_threshold_ > 1) {
|
||||
@ -115,89 +82,82 @@ void AdaptiveModeLevelEstimator::Update(
|
||||
}
|
||||
}
|
||||
num_adjacent_speech_frames_ = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
// Speech frame observed.
|
||||
num_adjacent_speech_frames_++;
|
||||
|
||||
// Update preliminary level estimate.
|
||||
RTC_DCHECK_GE(preliminary_state_.time_to_full_buffer_ms, 0);
|
||||
const bool buffer_is_full = preliminary_state_.time_to_full_buffer_ms == 0;
|
||||
if (!buffer_is_full) {
|
||||
preliminary_state_.time_to_full_buffer_ms -= kFrameDurationMs;
|
||||
}
|
||||
// Weighted average of levels with speech probability as weight.
|
||||
RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
|
||||
const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
|
||||
preliminary_state_.level_dbfs.numerator =
|
||||
preliminary_state_.level_dbfs.numerator * leak_factor +
|
||||
GetLevel(vad_level, level_estimator_type_) * vad_level.speech_probability;
|
||||
preliminary_state_.level_dbfs.denominator =
|
||||
preliminary_state_.level_dbfs.denominator * leak_factor +
|
||||
vad_level.speech_probability;
|
||||
|
||||
const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
|
||||
|
||||
UpdateSaturationProtectorState(vad_level.peak_dbfs, level_dbfs,
|
||||
preliminary_state_.saturation_protector);
|
||||
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// `preliminary_state_` is now reliable. Update the last level estimation.
|
||||
level_dbfs_ = ComputeLevelEstimateDbfs(
|
||||
level_dbfs, preliminary_state_.saturation_protector.margin_db,
|
||||
extra_saturation_margin_db_);
|
||||
} else {
|
||||
// Speech frame observed.
|
||||
num_adjacent_speech_frames_++;
|
||||
|
||||
// Update preliminary level estimate.
|
||||
RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
|
||||
const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
|
||||
if (!buffer_is_full) {
|
||||
preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
|
||||
}
|
||||
// Weighted average of levels with speech probability as weight.
|
||||
RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
|
||||
const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.f;
|
||||
preliminary_state_.level_dbfs.numerator =
|
||||
preliminary_state_.level_dbfs.numerator * leak_factor +
|
||||
vad_level.rms_dbfs * vad_level.speech_probability;
|
||||
preliminary_state_.level_dbfs.denominator =
|
||||
preliminary_state_.level_dbfs.denominator * leak_factor +
|
||||
vad_level.speech_probability;
|
||||
|
||||
const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
|
||||
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// `preliminary_state_` is now reliable. Update the last level estimation.
|
||||
level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
|
||||
}
|
||||
}
|
||||
DumpDebugData();
|
||||
}
|
||||
|
||||
bool AdaptiveModeLevelEstimator::IsConfident() const {
|
||||
if (adjacent_speech_frames_threshold_ == 1) {
|
||||
// Ignore `reliable_state_` when a single frame is enough to update the
|
||||
// level estimate (because it is not used).
|
||||
return preliminary_state_.time_to_full_buffer_ms == 0;
|
||||
return preliminary_state_.time_to_confidence_ms == 0;
|
||||
}
|
||||
// Once confident, it remains confident.
|
||||
RTC_DCHECK(reliable_state_.time_to_full_buffer_ms != 0 ||
|
||||
preliminary_state_.time_to_full_buffer_ms == 0);
|
||||
RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
|
||||
preliminary_state_.time_to_confidence_ms == 0);
|
||||
// During the first long enough speech sequence, `reliable_state_` must be
|
||||
// ignored since `preliminary_state_` is used.
|
||||
return reliable_state_.time_to_full_buffer_ms == 0 ||
|
||||
return reliable_state_.time_to_confidence_ms == 0 ||
|
||||
(num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
|
||||
preliminary_state_.time_to_full_buffer_ms == 0);
|
||||
preliminary_state_.time_to_confidence_ms == 0);
|
||||
}
|
||||
|
||||
void AdaptiveModeLevelEstimator::Reset() {
|
||||
ResetLevelEstimatorState(preliminary_state_);
|
||||
ResetLevelEstimatorState(reliable_state_);
|
||||
level_dbfs_ = ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs,
|
||||
initial_saturation_margin_db_,
|
||||
extra_saturation_margin_db_);
|
||||
level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs);
|
||||
num_adjacent_speech_frames_ = 0;
|
||||
}
|
||||
|
||||
void AdaptiveModeLevelEstimator::ResetLevelEstimatorState(
|
||||
LevelEstimatorState& state) const {
|
||||
state.time_to_full_buffer_ms = kFullBufferSizeMs;
|
||||
state.level_dbfs.numerator = 0.f;
|
||||
state.level_dbfs.denominator = 0.f;
|
||||
ResetSaturationProtectorState(initial_saturation_margin_db_,
|
||||
state.saturation_protector);
|
||||
state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
|
||||
state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs;
|
||||
state.level_dbfs.denominator = 1.0f;
|
||||
}
|
||||
|
||||
void AdaptiveModeLevelEstimator::DumpDebugData() const {
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs", level_dbfs_);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_num_adjacent_speech_frames",
|
||||
num_adjacent_speech_frames_);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_num",
|
||||
preliminary_state_.level_dbfs.numerator);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_den",
|
||||
preliminary_state_.level_dbfs.denominator);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_saturation_margin_db",
|
||||
preliminary_state_.saturation_protector.margin_db);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_time_to_full_buffer_ms",
|
||||
preliminary_state_.time_to_full_buffer_ms);
|
||||
apm_data_dumper_->DumpRaw("agc2_adaptive_reliable_time_to_full_buffer_ms",
|
||||
reliable_state_.time_to_full_buffer_ms);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_num_adjacent_speech_frames",
|
||||
num_adjacent_speech_frames_);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_level_estimate_num",
|
||||
preliminary_state_.level_dbfs.numerator);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_level_estimate_den",
|
||||
preliminary_state_.level_dbfs.denominator);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
|
||||
preliminary_state_.time_to_confidence_ms);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
|
||||
reliable_state_.time_to_confidence_ms);
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
#include <type_traits>
|
||||
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
#include "modules/audio_processing/agc2/saturation_protector.h"
|
||||
#include "modules/audio_processing/agc2/vad_with_level.h"
|
||||
#include "modules/audio_processing/include/audio_processing.h"
|
||||
|
||||
@ -29,12 +28,8 @@ class AdaptiveModeLevelEstimator {
|
||||
AdaptiveModeLevelEstimator(const AdaptiveModeLevelEstimator&) = delete;
|
||||
AdaptiveModeLevelEstimator& operator=(const AdaptiveModeLevelEstimator&) =
|
||||
delete;
|
||||
AdaptiveModeLevelEstimator(
|
||||
ApmDataDumper* apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
|
||||
int adjacent_speech_frames_threshold,
|
||||
float initial_saturation_margin_db,
|
||||
float extra_saturation_margin_db);
|
||||
AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper,
|
||||
int adjacent_speech_frames_threshold);
|
||||
|
||||
// Updates the level estimation.
|
||||
void Update(const VadLevelAnalyzer::Result& vad_data);
|
||||
@ -57,10 +52,9 @@ class AdaptiveModeLevelEstimator {
|
||||
float denominator;
|
||||
float GetRatio() const;
|
||||
};
|
||||
// TODO(crbug.com/webrtc/7494): Remove time_to_full_buffer_ms if redundant.
|
||||
int time_to_full_buffer_ms;
|
||||
// TODO(crbug.com/webrtc/7494): Remove time_to_confidence_ms if redundant.
|
||||
int time_to_confidence_ms;
|
||||
Ratio level_dbfs;
|
||||
SaturationProtectorState saturation_protector;
|
||||
};
|
||||
static_assert(std::is_trivially_copyable<LevelEstimatorState>::value, "");
|
||||
|
||||
@ -70,11 +64,7 @@ class AdaptiveModeLevelEstimator {
|
||||
|
||||
ApmDataDumper* const apm_data_dumper_;
|
||||
|
||||
const AudioProcessing::Config::GainController2::LevelEstimator
|
||||
level_estimator_type_;
|
||||
const int adjacent_speech_frames_threshold_;
|
||||
const float initial_saturation_margin_db_;
|
||||
const float extra_saturation_margin_db_;
|
||||
LevelEstimatorState preliminary_state_;
|
||||
LevelEstimatorState reliable_state_;
|
||||
float level_dbfs_;
|
||||
|
||||
@ -19,22 +19,34 @@
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
constexpr float kInitialSaturationMarginDb = 20.f;
|
||||
constexpr float kExtraSaturationMarginDb = 2.f;
|
||||
// Number of speech frames that the level estimator must observe in order to
|
||||
// become confident about the estimated level.
|
||||
constexpr int kNumFramesToConfidence =
|
||||
kLevelEstimatorTimeToConfidenceMs / kFrameDurationMs;
|
||||
static_assert(kNumFramesToConfidence > 0, "");
|
||||
|
||||
static_assert(kInitialSpeechLevelEstimateDbfs < 0.f, "");
|
||||
constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.f;
|
||||
constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.f;
|
||||
// Fake levels and speech probabilities used in the tests.
|
||||
static_assert(kInitialSpeechLevelEstimateDbfs < 0.0f, "");
|
||||
constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.0f;
|
||||
constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.0f;
|
||||
static_assert(kVadLevelRms < kVadLevelPeak, "");
|
||||
static_assert(kVadLevelRms > kInitialSpeechLevelEstimateDbfs, "");
|
||||
static_assert(kVadLevelRms - kInitialSpeechLevelEstimateDbfs > 5.0f,
|
||||
"Adjust `kVadLevelRms` so that the difference from the initial "
|
||||
"level is wide enough for the tests.");
|
||||
|
||||
constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.f,
|
||||
constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.0f,
|
||||
kVadLevelRms, kVadLevelPeak};
|
||||
constexpr VadLevelAnalyzer::Result kVadDataNonSpeech{
|
||||
/*speech_probability=*/kVadConfidenceThreshold / 2.f, kVadLevelRms,
|
||||
/*speech_probability=*/kVadConfidenceThreshold / 2.0f, kVadLevelRms,
|
||||
kVadLevelPeak};
|
||||
|
||||
constexpr float kMinSpeechProbability = 0.f;
|
||||
constexpr float kMaxSpeechProbability = 1.f;
|
||||
constexpr float kMinSpeechProbability = 0.0f;
|
||||
constexpr float kMaxSpeechProbability = 1.0f;
|
||||
|
||||
constexpr float kConvergenceSpeedTestsLevelTolerance = 0.5f;
|
||||
|
||||
// Provides the `vad_level` value `num_iterations` times to `level_estimator`.
|
||||
void RunOnConstantLevel(int num_iterations,
|
||||
const VadLevelAnalyzer::Result& vad_level,
|
||||
AdaptiveModeLevelEstimator& level_estimator) {
|
||||
@ -43,172 +55,125 @@ void RunOnConstantLevel(int num_iterations,
|
||||
}
|
||||
}
|
||||
|
||||
// Level estimator with data dumper.
|
||||
struct TestLevelEstimator {
|
||||
TestLevelEstimator()
|
||||
: data_dumper(0),
|
||||
estimator(std::make_unique<AdaptiveModeLevelEstimator>(
|
||||
&data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
|
||||
/*adjacent_speech_frames_threshold=*/1,
|
||||
kInitialSaturationMarginDb,
|
||||
kExtraSaturationMarginDb)) {}
|
||||
/*adjacent_speech_frames_threshold=*/1)) {}
|
||||
ApmDataDumper data_dumper;
|
||||
std::unique_ptr<AdaptiveModeLevelEstimator> estimator;
|
||||
};
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
|
||||
EstimatorShouldNotCrash) {
|
||||
// Checks the initially estimated level.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator, CheckInitialEstimate) {
|
||||
TestLevelEstimator level_estimator;
|
||||
|
||||
VadLevelAnalyzer::Result vad_level{kMaxSpeechProbability, /*rms_dbfs=*/-20.f,
|
||||
/*peak_dbfs=*/-10.f};
|
||||
level_estimator.estimator->Update(vad_level);
|
||||
static_cast<void>(level_estimator.estimator->level_dbfs());
|
||||
EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
|
||||
kInitialSpeechLevelEstimateDbfs);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
|
||||
// Checks that the level estimator converges to a constant input speech level.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator, LevelStabilizes) {
|
||||
TestLevelEstimator level_estimator;
|
||||
|
||||
constexpr float kSpeechPeakDbfs = -15.f;
|
||||
RunOnConstantLevel(100,
|
||||
VadLevelAnalyzer::Result{kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kSpeechPeakDbfs -
|
||||
kInitialSaturationMarginDb,
|
||||
kSpeechPeakDbfs},
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
|
||||
*level_estimator.estimator);
|
||||
|
||||
EXPECT_NEAR(
|
||||
level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
|
||||
kSpeechPeakDbfs, 0.1f);
|
||||
const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
|
||||
RunOnConstantLevel(/*num_iterations=*/1, kVadDataSpeech,
|
||||
*level_estimator.estimator);
|
||||
EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs,
|
||||
0.1f);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
|
||||
EstimatorIgnoresZeroProbabilityFrames) {
|
||||
// Checks that the level estimator does not become confident when too few
|
||||
// speech frames are observed.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator, IsNotConfident) {
|
||||
TestLevelEstimator level_estimator;
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2,
|
||||
kVadDataSpeech, *level_estimator.estimator);
|
||||
EXPECT_FALSE(level_estimator.estimator->IsConfident());
|
||||
}
|
||||
|
||||
// Run for one second of fake audio.
|
||||
constexpr float kSpeechRmsDbfs = -25.f;
|
||||
RunOnConstantLevel(100,
|
||||
VadLevelAnalyzer::Result{kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kSpeechRmsDbfs -
|
||||
kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kSpeechRmsDbfs},
|
||||
// Checks that the level estimator becomes confident when enough speech frames
|
||||
// are observed.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator, IsConfident) {
|
||||
TestLevelEstimator level_estimator;
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
|
||||
*level_estimator.estimator);
|
||||
EXPECT_TRUE(level_estimator.estimator->IsConfident());
|
||||
}
|
||||
|
||||
// Run for one more second, but mark as not speech.
|
||||
constexpr float kNoiseRmsDbfs = 0.f;
|
||||
RunOnConstantLevel(100,
|
||||
// Checks that the estimated level is not affected by the level of non-speech
|
||||
// frames.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator,
|
||||
EstimatorIgnoresNonSpeechFrames) {
|
||||
TestLevelEstimator level_estimator;
|
||||
// Simulate speech.
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
|
||||
*level_estimator.estimator);
|
||||
const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
|
||||
// Simulate full-scale non-speech.
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
|
||||
VadLevelAnalyzer::Result{kMinSpeechProbability,
|
||||
/*rms_dbfs=*/kNoiseRmsDbfs,
|
||||
/*peak_dbfs=*/kNoiseRmsDbfs},
|
||||
/*rms_dbfs=*/0.0f,
|
||||
/*peak_dbfs=*/0.0f},
|
||||
*level_estimator.estimator);
|
||||
|
||||
// Level should not have changed.
|
||||
EXPECT_NEAR(
|
||||
level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
|
||||
kSpeechRmsDbfs, 0.1f);
|
||||
// No estimated level change is expected.
|
||||
EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
|
||||
estimated_level_dbfs);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
|
||||
// Checks the convergence speed of the estimator before it becomes confident.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator,
|
||||
ConvergenceSpeedBeforeConfidence) {
|
||||
TestLevelEstimator level_estimator;
|
||||
|
||||
// Run for one 'window size' interval.
|
||||
constexpr float kInitialSpeechRmsDbfs = -30.f;
|
||||
RunOnConstantLevel(
|
||||
kFullBufferSizeMs / kFrameDurationMs,
|
||||
VadLevelAnalyzer::Result{
|
||||
kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kInitialSpeechRmsDbfs},
|
||||
*level_estimator.estimator);
|
||||
|
||||
// Run for one half 'window size' interval. This should not be enough to
|
||||
// adapt.
|
||||
constexpr float kDifferentSpeechRmsDbfs = -10.f;
|
||||
// It should at most differ by 25% after one half 'window size' interval.
|
||||
// TODO(crbug.com/webrtc/7494): Add constexpr for repeated expressions.
|
||||
const float kMaxDifferenceDb =
|
||||
0.25f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
|
||||
RunOnConstantLevel(
|
||||
static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
|
||||
VadLevelAnalyzer::Result{
|
||||
kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kDifferentSpeechRmsDbfs},
|
||||
*level_estimator.estimator);
|
||||
EXPECT_GT(std::abs(kDifferentSpeechRmsDbfs -
|
||||
level_estimator.estimator->level_dbfs()),
|
||||
kMaxDifferenceDb);
|
||||
|
||||
// Run for some more time. Afterwards, we should have adapted.
|
||||
RunOnConstantLevel(
|
||||
static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
|
||||
VadLevelAnalyzer::Result{
|
||||
kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kDifferentSpeechRmsDbfs},
|
||||
*level_estimator.estimator);
|
||||
EXPECT_NEAR(
|
||||
level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
|
||||
kDifferentSpeechRmsDbfs, kMaxDifferenceDb * 0.5f);
|
||||
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
|
||||
*level_estimator.estimator);
|
||||
EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs,
|
||||
kConvergenceSpeedTestsLevelTolerance);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
|
||||
ResetGivesFastAdaptation) {
|
||||
// Checks the convergence speed of the estimator after it becomes confident.
|
||||
TEST(GainController2AdaptiveModeLevelEstimator,
|
||||
ConvergenceSpeedAfterConfidence) {
|
||||
TestLevelEstimator level_estimator;
|
||||
|
||||
// Run the level estimator for one window size interval. This gives time to
|
||||
// adapt.
|
||||
constexpr float kInitialSpeechRmsDbfs = -30.f;
|
||||
// Reach confidence using the initial level estimate.
|
||||
RunOnConstantLevel(
|
||||
kFullBufferSizeMs / kFrameDurationMs,
|
||||
/*num_iterations=*/kNumFramesToConfidence,
|
||||
VadLevelAnalyzer::Result{
|
||||
kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kInitialSpeechRmsDbfs},
|
||||
/*rms_dbfs=*/kInitialSpeechLevelEstimateDbfs,
|
||||
/*peak_dbfs=*/kInitialSpeechLevelEstimateDbfs + 6.0f},
|
||||
*level_estimator.estimator);
|
||||
|
||||
constexpr float kDifferentSpeechRmsDbfs = -10.f;
|
||||
// Reset and run one half window size interval.
|
||||
level_estimator.estimator->Reset();
|
||||
|
||||
// No estimate change should occur, but confidence is achieved.
|
||||
ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
|
||||
kInitialSpeechLevelEstimateDbfs);
|
||||
ASSERT_TRUE(level_estimator.estimator->IsConfident());
|
||||
// After confidence.
|
||||
constexpr float kConvergenceTimeAfterConfidenceNumFrames = 600; // 6 seconds.
|
||||
static_assert(
|
||||
kConvergenceTimeAfterConfidenceNumFrames > kNumFramesToConfidence, "");
|
||||
RunOnConstantLevel(
|
||||
kFullBufferSizeMs / kFrameDurationMs / 2,
|
||||
VadLevelAnalyzer::Result{
|
||||
kMaxSpeechProbability,
|
||||
/*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
|
||||
/*peak_dbfs=*/kDifferentSpeechRmsDbfs},
|
||||
*level_estimator.estimator);
|
||||
|
||||
// The level should be close to 'kDifferentSpeechRmsDbfs'.
|
||||
const float kMaxDifferenceDb =
|
||||
0.1f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
|
||||
EXPECT_LT(std::abs(kDifferentSpeechRmsDbfs -
|
||||
(level_estimator.estimator->level_dbfs() -
|
||||
kExtraSaturationMarginDb)),
|
||||
kMaxDifferenceDb);
|
||||
/*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames,
|
||||
kVadDataSpeech, *level_estimator.estimator);
|
||||
EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs,
|
||||
kConvergenceSpeedTestsLevelTolerance);
|
||||
}
|
||||
|
||||
struct TestConfig {
|
||||
int min_consecutive_speech_frames;
|
||||
float initial_saturation_margin_db;
|
||||
float extra_saturation_margin_db;
|
||||
class AdaptiveModeLevelEstimatorParametrization
|
||||
: public ::testing::TestWithParam<int> {
|
||||
protected:
|
||||
int adjacent_speech_frames_threshold() const { return GetParam(); }
|
||||
};
|
||||
|
||||
class AdaptiveModeLevelEstimatorTest
|
||||
: public ::testing::TestWithParam<TestConfig> {};
|
||||
|
||||
TEST_P(AdaptiveModeLevelEstimatorTest, DoNotAdaptToShortSpeechSegments) {
|
||||
const auto params = GetParam();
|
||||
TEST_P(AdaptiveModeLevelEstimatorParametrization,
|
||||
DoNotAdaptToShortSpeechSegments) {
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
AdaptiveModeLevelEstimator level_estimator(
|
||||
&apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
|
||||
params.min_consecutive_speech_frames, params.initial_saturation_margin_db,
|
||||
params.extra_saturation_margin_db);
|
||||
&apm_data_dumper, adjacent_speech_frames_threshold());
|
||||
const float initial_level = level_estimator.level_dbfs();
|
||||
ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs);
|
||||
for (int i = 0; i < params.min_consecutive_speech_frames - 1; ++i) {
|
||||
ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs);
|
||||
for (int i = 0; i < adjacent_speech_frames_threshold() - 1; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
level_estimator.Update(kVadDataSpeech);
|
||||
EXPECT_EQ(initial_level, level_estimator.level_dbfs());
|
||||
@ -217,26 +182,21 @@ TEST_P(AdaptiveModeLevelEstimatorTest, DoNotAdaptToShortSpeechSegments) {
|
||||
EXPECT_EQ(initial_level, level_estimator.level_dbfs());
|
||||
}
|
||||
|
||||
TEST_P(AdaptiveModeLevelEstimatorTest, AdaptToEnoughSpeechSegments) {
|
||||
const auto params = GetParam();
|
||||
TEST_P(AdaptiveModeLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) {
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
AdaptiveModeLevelEstimator level_estimator(
|
||||
&apm_data_dumper,
|
||||
AudioProcessing::Config::GainController2::LevelEstimator::kRms,
|
||||
params.min_consecutive_speech_frames, params.initial_saturation_margin_db,
|
||||
params.extra_saturation_margin_db);
|
||||
&apm_data_dumper, adjacent_speech_frames_threshold());
|
||||
const float initial_level = level_estimator.level_dbfs();
|
||||
ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs);
|
||||
for (int i = 0; i < params.min_consecutive_speech_frames; ++i) {
|
||||
ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs);
|
||||
for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) {
|
||||
level_estimator.Update(kVadDataSpeech);
|
||||
}
|
||||
EXPECT_LT(initial_level, level_estimator.level_dbfs());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
|
||||
AdaptiveModeLevelEstimatorTest,
|
||||
::testing::Values(TestConfig{1, 0.f, 0.f},
|
||||
TestConfig{9, 0.f, 0.f}));
|
||||
INSTANTIATE_TEST_SUITE_P(GainController2,
|
||||
AdaptiveModeLevelEstimatorParametrization,
|
||||
::testing::Values(1, 9, 17));
|
||||
|
||||
} // namespace
|
||||
} // namespace webrtc
|
||||
|
||||
@ -11,20 +11,19 @@
|
||||
#ifndef MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_
|
||||
#define MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
constexpr float kMinFloatS16Value = -32768.0f;
|
||||
constexpr float kMaxFloatS16Value = 32767.0f;
|
||||
constexpr float kMaxAbsFloatS16Value = 32768.0f;
|
||||
|
||||
// Minimum audio level in dBFS scale for S16 samples.
|
||||
constexpr float kMinLevelDbfs = -90.31f;
|
||||
|
||||
constexpr int kFrameDurationMs = 10;
|
||||
constexpr int kSubFramesInFrame = 20;
|
||||
constexpr int kMaximalNumberOfSamplesPerChannel = 480;
|
||||
|
||||
constexpr float kAttackFilterConstant = 0.0f;
|
||||
|
||||
// Adaptive digital gain applier settings below.
|
||||
constexpr float kHeadroomDbfs = 1.0f;
|
||||
constexpr float kMaxGainDb = 30.0f;
|
||||
@ -37,43 +36,29 @@ constexpr float kLimiterThresholdForAgcGainDbfs = -kHeadroomDbfs;
|
||||
// gain reduction.
|
||||
constexpr float kVadConfidenceThreshold = 0.95f;
|
||||
|
||||
// The amount of 'memory' of the Level Estimator. Decides leak factors.
|
||||
constexpr int kFullBufferSizeMs = 1200;
|
||||
constexpr float kFullBufferLeakFactor = 1.0f - 1.0f / kFullBufferSizeMs;
|
||||
|
||||
constexpr float kInitialSpeechLevelEstimateDbfs = -30.0f;
|
||||
// Adaptive digital level estimator parameters.
|
||||
// Number of milliseconds of speech frames to observe to make the estimator
|
||||
// confident.
|
||||
constexpr float kLevelEstimatorTimeToConfidenceMs = 400;
|
||||
constexpr float kLevelEstimatorLeakFactor =
|
||||
1.0f - 1.0f / kLevelEstimatorTimeToConfidenceMs;
|
||||
|
||||
// Robust VAD probability and speech decisions.
|
||||
constexpr int kDefaultVadRnnResetPeriodMs = 1500;
|
||||
static_assert(kDefaultVadRnnResetPeriodMs % kFrameDurationMs == 0, "");
|
||||
constexpr float kDefaultSmoothedVadProbabilityAttack = 1.0f;
|
||||
constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1;
|
||||
constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 12;
|
||||
|
||||
// Saturation Protector settings.
|
||||
constexpr float kDefaultInitialSaturationMarginDb = 20.0f;
|
||||
constexpr float kDefaultExtraSaturationMarginDb = 2.0f;
|
||||
constexpr float kSaturationProtectorInitialHeadroomDb = 20.0f;
|
||||
constexpr float kSaturationProtectorExtraHeadroomDb = 5.0f;
|
||||
constexpr int kSaturationProtectorBufferSize = 4;
|
||||
|
||||
constexpr int kPeakEnveloperSuperFrameLengthMs = 400;
|
||||
static_assert(kFullBufferSizeMs % kPeakEnveloperSuperFrameLengthMs == 0,
|
||||
"Full buffer size should be a multiple of super frame length for "
|
||||
"optimal Saturation Protector performance.");
|
||||
|
||||
constexpr int kPeakEnveloperBufferSize =
|
||||
kFullBufferSizeMs / kPeakEnveloperSuperFrameLengthMs + 1;
|
||||
|
||||
// This value is 10 ** (-1/20 * frame_size_ms / satproc_attack_ms),
|
||||
// where satproc_attack_ms is 5000.
|
||||
constexpr float kSaturationProtectorAttackConstant = 0.9988493699365052f;
|
||||
|
||||
// This value is 10 ** (-1/20 * frame_size_ms / satproc_decay_ms),
|
||||
// where satproc_decay_ms is 1000.
|
||||
constexpr float kSaturationProtectorDecayConstant = 0.9997697679981565f;
|
||||
|
||||
// This is computed from kDecayMs by
|
||||
// 10 ** (-1/20 * subframe_duration / kDecayMs).
|
||||
// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
|
||||
// kDecayMs is defined in agc2_testing_common.h
|
||||
constexpr float kDecayFilterConstant = 0.9998848773724686f;
|
||||
// Set the initial speech level estimate so that `kInitialAdaptiveDigitalGainDb`
|
||||
// is applied at the beginning of the call.
|
||||
constexpr float kInitialSpeechLevelEstimateDbfs =
|
||||
-kSaturationProtectorExtraHeadroomDb -
|
||||
kSaturationProtectorInitialHeadroomDb - kInitialAdaptiveDigitalGainDb -
|
||||
kHeadroomDbfs;
|
||||
|
||||
// Number of interpolation points for each region of the limiter.
|
||||
// These values have been tuned to limit the interpolated gain curve error given
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
TEST(AutomaticGainController2Common, TestLinSpace) {
|
||||
TEST(GainController2TestingCommon, LinSpace) {
|
||||
std::vector<double> points1 = test::LinSpace(-1.0, 2.0, 4);
|
||||
const std::vector<double> expected_points1{{-1.0, 0.0, 1.0, 2.0}};
|
||||
EXPECT_EQ(expected_points1, points1);
|
||||
|
||||
@ -22,6 +22,14 @@ namespace {
|
||||
|
||||
constexpr float kInitialFilterStateLevel = 0.f;
|
||||
|
||||
// Instant attack.
|
||||
constexpr float kAttackFilterConstant = 0.f;
|
||||
// This is computed from kDecayMs by
|
||||
// 10 ** (-1/20 * subframe_duration / kDecayMs).
|
||||
// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
|
||||
// kDecayMs is defined in agc2_testing_common.h
|
||||
constexpr float kDecayFilterConstant = 0.9998848773724686f;
|
||||
|
||||
} // namespace
|
||||
|
||||
FixedDigitalLevelEstimator::FixedDigitalLevelEstimator(
|
||||
|
||||
@ -101,25 +101,25 @@ float TimeMsToDecreaseLevel(int sample_rate_hz,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator, EstimatorShouldNotCrash) {
|
||||
TEST(GainController2FixedDigitalLevelEstimator, EstimatorShouldNotCrash) {
|
||||
TestLevelEstimator(8000, 1, 0, std::numeric_limits<float>::lowest(),
|
||||
std::numeric_limits<float>::max());
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator,
|
||||
TEST(GainController2FixedDigitalLevelEstimator,
|
||||
EstimatorShouldEstimateConstantLevel) {
|
||||
TestLevelEstimator(10000, 1, kInputLevel, kInputLevel * 0.99,
|
||||
kInputLevel * 1.01);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator,
|
||||
TEST(GainController2FixedDigitalLevelEstimator,
|
||||
EstimatorShouldEstimateConstantLevelForManyChannels) {
|
||||
constexpr size_t num_channels = 10;
|
||||
TestLevelEstimator(20000, num_channels, kInputLevel, kInputLevel * 0.99,
|
||||
kInputLevel * 1.01);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForLowLevel) {
|
||||
TEST(GainController2FixedDigitalLevelEstimator, TimeToDecreaseForLowLevel) {
|
||||
constexpr float kLevelReductionDb = 25;
|
||||
constexpr float kInitialLowLevel = -40;
|
||||
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
|
||||
@ -131,7 +131,8 @@ TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForLowLevel) {
|
||||
EXPECT_LE(time_to_decrease, kExpectedTime * 1.1);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForFullScaleLevel) {
|
||||
TEST(GainController2FixedDigitalLevelEstimator,
|
||||
TimeToDecreaseForFullScaleLevel) {
|
||||
constexpr float kLevelReductionDb = 25;
|
||||
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
|
||||
|
||||
@ -142,7 +143,7 @@ TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForFullScaleLevel) {
|
||||
EXPECT_LE(time_to_decrease, kExpectedTime * 1.1);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2LevelEstimator,
|
||||
TEST(GainController2FixedDigitalLevelEstimator,
|
||||
TimeToDecreaseForMultipleChannels) {
|
||||
constexpr float kLevelReductionDb = 25;
|
||||
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
|
||||
|
||||
@ -75,7 +75,7 @@ class InterpolatedGainCurve {
|
||||
private:
|
||||
// For comparing 'approximation_params_*_' with ones computed by
|
||||
// ComputeInterpolatedGainCurve.
|
||||
FRIEND_TEST_ALL_PREFIXES(AutomaticGainController2InterpolatedGainCurve,
|
||||
FRIEND_TEST_ALL_PREFIXES(GainController2InterpolatedGainCurve,
|
||||
CheckApproximationParams);
|
||||
|
||||
struct RegionLogger {
|
||||
|
||||
@ -34,7 +34,7 @@ const LimiterDbGainCurve limiter;
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CreateUse) {
|
||||
TEST(GainController2InterpolatedGainCurve, CreateUse) {
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
const auto levels = test::LinSpace(
|
||||
@ -44,7 +44,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CreateUse) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckValidOutput) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckValidOutput) {
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
const auto levels = test::LinSpace(
|
||||
@ -57,7 +57,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckValidOutput) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckMonotonicity) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckMonotonicity) {
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
const auto levels = test::LinSpace(
|
||||
@ -71,7 +71,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckMonotonicity) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximation) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckApproximation) {
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
const auto levels = test::LinSpace(
|
||||
@ -84,7 +84,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximation) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckRegionBoundaries) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckRegionBoundaries) {
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
const std::vector<double> levels{
|
||||
@ -102,7 +102,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckRegionBoundaries) {
|
||||
EXPECT_EQ(1ul, stats.look_ups_saturation_region);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckIdentityRegion) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckIdentityRegion) {
|
||||
constexpr size_t kNumSteps = 10;
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
@ -120,8 +120,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckIdentityRegion) {
|
||||
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
CheckNoOverApproximationKnee) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationKnee) {
|
||||
constexpr size_t kNumSteps = 10;
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
@ -142,8 +141,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
CheckNoOverApproximationBeyondKnee) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationBeyondKnee) {
|
||||
constexpr size_t kNumSteps = 10;
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
|
||||
@ -164,7 +162,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
TEST(GainController2InterpolatedGainCurve,
|
||||
CheckNoOverApproximationWithSaturation) {
|
||||
constexpr size_t kNumSteps = 3;
|
||||
InterpolatedGainCurve igc(&apm_data_dumper, "");
|
||||
@ -184,7 +182,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve,
|
||||
EXPECT_EQ(kNumSteps, stats.look_ups_saturation_region);
|
||||
}
|
||||
|
||||
TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximationParams) {
|
||||
TEST(GainController2InterpolatedGainCurve, CheckApproximationParams) {
|
||||
test::InterpolatedParameters parameters =
|
||||
test::ComputeInterpolatedGainCurveApproximationParams();
|
||||
|
||||
|
||||
@ -184,7 +184,7 @@ class NoiseFloorEstimator : public NoiseLevelEstimator {
|
||||
const float frame_energy = FrameEnergy(frame);
|
||||
if (frame_energy <= min_noise_energy_) {
|
||||
// Ignore frames when muted or below the minimum measurable energy.
|
||||
data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
|
||||
data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level",
|
||||
noise_energy_);
|
||||
return EnergyToDbfs(noise_energy_, frame.samples_per_channel());
|
||||
}
|
||||
@ -196,7 +196,7 @@ class NoiseFloorEstimator : public NoiseLevelEstimator {
|
||||
preliminary_noise_energy_ = frame_energy;
|
||||
preliminary_noise_energy_set_ = true;
|
||||
}
|
||||
data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
|
||||
data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level",
|
||||
preliminary_noise_energy_);
|
||||
|
||||
if (counter_ == 0) {
|
||||
|
||||
@ -10,84 +10,59 @@
|
||||
|
||||
#include "modules/audio_processing/agc2/saturation_protector.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
|
||||
#include "modules/audio_processing/logging/apm_data_dumper.h"
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/numerics/safe_minmax.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
constexpr float kMinLevelDbfs = -90.f;
|
||||
constexpr int kPeakEnveloperSuperFrameLengthMs = 400;
|
||||
constexpr float kMinMarginDb = 12.0f;
|
||||
constexpr float kMaxMarginDb = 25.0f;
|
||||
constexpr float kAttack = 0.9988493699365052f;
|
||||
constexpr float kDecay = 0.9997697679981565f;
|
||||
|
||||
// Min/max margins are based on speech crest-factor.
|
||||
constexpr float kMinMarginDb = 12.f;
|
||||
constexpr float kMaxMarginDb = 25.f;
|
||||
|
||||
using saturation_protector_impl::RingBuffer;
|
||||
|
||||
} // namespace
|
||||
|
||||
bool RingBuffer::operator==(const RingBuffer& b) const {
|
||||
RTC_DCHECK_LE(size_, buffer_.size());
|
||||
RTC_DCHECK_LE(b.size_, b.buffer_.size());
|
||||
if (size_ != b.size_) {
|
||||
return false;
|
||||
// Saturation protector state. Defined outside of `SaturationProtectorImpl` to
|
||||
// implement check-point and restore ops.
|
||||
struct SaturationProtectorState {
|
||||
bool operator==(const SaturationProtectorState& s) const {
|
||||
return headroom_db == s.headroom_db &&
|
||||
peak_delay_buffer == s.peak_delay_buffer &&
|
||||
max_peaks_dbfs == s.max_peaks_dbfs &&
|
||||
time_since_push_ms == s.time_since_push_ms;
|
||||
}
|
||||
for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_;
|
||||
++i, ++i0, ++i1) {
|
||||
if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) {
|
||||
return false;
|
||||
}
|
||||
inline bool operator!=(const SaturationProtectorState& s) const {
|
||||
return !(*this == s);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void RingBuffer::Reset() {
|
||||
next_ = 0;
|
||||
size_ = 0;
|
||||
}
|
||||
float headroom_db;
|
||||
SaturationProtectorBuffer peak_delay_buffer;
|
||||
float max_peaks_dbfs;
|
||||
int time_since_push_ms; // Time since the last ring buffer push operation.
|
||||
};
|
||||
|
||||
void RingBuffer::PushBack(float v) {
|
||||
RTC_DCHECK_GE(next_, 0);
|
||||
RTC_DCHECK_GE(size_, 0);
|
||||
RTC_DCHECK_LT(next_, buffer_.size());
|
||||
RTC_DCHECK_LE(size_, buffer_.size());
|
||||
buffer_[next_++] = v;
|
||||
if (rtc::SafeEq(next_, buffer_.size())) {
|
||||
next_ = 0;
|
||||
}
|
||||
if (rtc::SafeLt(size_, buffer_.size())) {
|
||||
size_++;
|
||||
}
|
||||
}
|
||||
|
||||
absl::optional<float> RingBuffer::Front() const {
|
||||
if (size_ == 0) {
|
||||
return absl::nullopt;
|
||||
}
|
||||
RTC_DCHECK_LT(FrontIndex(), buffer_.size());
|
||||
return buffer_[FrontIndex()];
|
||||
}
|
||||
|
||||
bool SaturationProtectorState::operator==(
|
||||
const SaturationProtectorState& b) const {
|
||||
return margin_db == b.margin_db && peak_delay_buffer == b.peak_delay_buffer &&
|
||||
max_peaks_dbfs == b.max_peaks_dbfs &&
|
||||
time_since_push_ms == b.time_since_push_ms;
|
||||
}
|
||||
|
||||
void ResetSaturationProtectorState(float initial_margin_db,
|
||||
// Resets the saturation protector state.
|
||||
void ResetSaturationProtectorState(float initial_headroom_db,
|
||||
SaturationProtectorState& state) {
|
||||
state.margin_db = initial_margin_db;
|
||||
state.headroom_db = initial_headroom_db;
|
||||
state.peak_delay_buffer.Reset();
|
||||
state.max_peaks_dbfs = kMinLevelDbfs;
|
||||
state.time_since_push_ms = 0;
|
||||
}
|
||||
|
||||
void UpdateSaturationProtectorState(float speech_peak_dbfs,
|
||||
// Updates `state` by analyzing the estimated speech level `speech_level_dbfs`
|
||||
// and the peak level `peak_dbfs` for an observed frame. `state` must not be
|
||||
// modified without calling this function.
|
||||
void UpdateSaturationProtectorState(float peak_dbfs,
|
||||
float speech_level_dbfs,
|
||||
SaturationProtectorState& state) {
|
||||
// Get the max peak over `kPeakEnveloperSuperFrameLengthMs` ms.
|
||||
state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, speech_peak_dbfs);
|
||||
state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, peak_dbfs);
|
||||
state.time_since_push_ms += kFrameDurationMs;
|
||||
if (rtc::SafeGt(state.time_since_push_ms, kPeakEnveloperSuperFrameLengthMs)) {
|
||||
// Push `max_peaks_dbfs` back into the ring buffer.
|
||||
@ -97,25 +72,117 @@ void UpdateSaturationProtectorState(float speech_peak_dbfs,
|
||||
state.time_since_push_ms = 0;
|
||||
}
|
||||
|
||||
// Update margin by comparing the estimated speech level and the delayed max
|
||||
// speech peak power.
|
||||
// TODO(alessiob): Check with aleloi@ why we use a delay and how to tune it.
|
||||
// Update the headroom by comparing the estimated speech level and the delayed
|
||||
// max speech peak.
|
||||
const float delayed_peak_dbfs =
|
||||
state.peak_delay_buffer.Front().value_or(state.max_peaks_dbfs);
|
||||
const float difference_db = delayed_peak_dbfs - speech_level_dbfs;
|
||||
if (difference_db > state.margin_db) {
|
||||
if (difference_db > state.headroom_db) {
|
||||
// Attack.
|
||||
state.margin_db =
|
||||
state.margin_db * kSaturationProtectorAttackConstant +
|
||||
difference_db * (1.f - kSaturationProtectorAttackConstant);
|
||||
state.headroom_db =
|
||||
state.headroom_db * kAttack + difference_db * (1.0f - kAttack);
|
||||
} else {
|
||||
// Decay.
|
||||
state.margin_db = state.margin_db * kSaturationProtectorDecayConstant +
|
||||
difference_db * (1.f - kSaturationProtectorDecayConstant);
|
||||
state.headroom_db =
|
||||
state.headroom_db * kDecay + difference_db * (1.0f - kDecay);
|
||||
}
|
||||
|
||||
state.margin_db =
|
||||
rtc::SafeClamp<float>(state.margin_db, kMinMarginDb, kMaxMarginDb);
|
||||
state.headroom_db =
|
||||
rtc::SafeClamp<float>(state.headroom_db, kMinMarginDb, kMaxMarginDb);
|
||||
}
|
||||
|
||||
// Saturation protector which recommends a headroom based on the recent peaks.
|
||||
class SaturationProtectorImpl : public SaturationProtector {
|
||||
public:
|
||||
explicit SaturationProtectorImpl(float initial_headroom_db,
|
||||
float extra_headroom_db,
|
||||
int adjacent_speech_frames_threshold,
|
||||
ApmDataDumper* apm_data_dumper)
|
||||
: apm_data_dumper_(apm_data_dumper),
|
||||
initial_headroom_db_(initial_headroom_db),
|
||||
extra_headroom_db_(extra_headroom_db),
|
||||
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold) {
|
||||
Reset();
|
||||
}
|
||||
SaturationProtectorImpl(const SaturationProtectorImpl&) = delete;
|
||||
SaturationProtectorImpl& operator=(const SaturationProtectorImpl&) = delete;
|
||||
~SaturationProtectorImpl() = default;
|
||||
|
||||
float HeadroomDb() override { return headroom_db_; }
|
||||
|
||||
void Analyze(float speech_probability,
|
||||
float peak_dbfs,
|
||||
float speech_level_dbfs) override {
|
||||
if (speech_probability < kVadConfidenceThreshold) {
|
||||
// Not a speech frame.
|
||||
if (adjacent_speech_frames_threshold_ > 1) {
|
||||
// When two or more adjacent speech frames are required in order to
|
||||
// update the state, we need to decide whether to discard or confirm the
|
||||
// updates based on the speech sequence length.
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// First non-speech frame after a long enough sequence of speech
|
||||
// frames. Update the reliable state.
|
||||
reliable_state_ = preliminary_state_;
|
||||
} else if (num_adjacent_speech_frames_ > 0) {
|
||||
// First non-speech frame after a too short sequence of speech frames.
|
||||
// Reset to the last reliable state.
|
||||
preliminary_state_ = reliable_state_;
|
||||
}
|
||||
}
|
||||
num_adjacent_speech_frames_ = 0;
|
||||
} else {
|
||||
// Speech frame observed.
|
||||
num_adjacent_speech_frames_++;
|
||||
|
||||
// Update preliminary level estimate.
|
||||
UpdateSaturationProtectorState(peak_dbfs, speech_level_dbfs,
|
||||
preliminary_state_);
|
||||
|
||||
if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
|
||||
// `preliminary_state_` is now reliable. Update the headroom.
|
||||
headroom_db_ = preliminary_state_.headroom_db + extra_headroom_db_;
|
||||
}
|
||||
}
|
||||
DumpDebugData();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
num_adjacent_speech_frames_ = 0;
|
||||
headroom_db_ = initial_headroom_db_ + extra_headroom_db_;
|
||||
ResetSaturationProtectorState(initial_headroom_db_, preliminary_state_);
|
||||
ResetSaturationProtectorState(initial_headroom_db_, reliable_state_);
|
||||
}
|
||||
|
||||
private:
|
||||
void DumpDebugData() {
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_saturation_protector_preliminary_max_peak_dbfs",
|
||||
preliminary_state_.max_peaks_dbfs);
|
||||
apm_data_dumper_->DumpRaw(
|
||||
"agc2_saturation_protector_reliable_max_peak_dbfs",
|
||||
reliable_state_.max_peaks_dbfs);
|
||||
}
|
||||
|
||||
ApmDataDumper* const apm_data_dumper_;
|
||||
const float initial_headroom_db_;
|
||||
const float extra_headroom_db_;
|
||||
const int adjacent_speech_frames_threshold_;
|
||||
int num_adjacent_speech_frames_;
|
||||
float headroom_db_;
|
||||
SaturationProtectorState preliminary_state_;
|
||||
SaturationProtectorState reliable_state_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
std::unique_ptr<SaturationProtector> CreateSaturationProtector(
|
||||
float initial_headroom_db,
|
||||
float extra_headroom_db,
|
||||
int adjacent_speech_frames_threshold,
|
||||
ApmDataDumper* apm_data_dumper) {
|
||||
return std::make_unique<SaturationProtectorImpl>(
|
||||
initial_headroom_db, extra_headroom_db, adjacent_speech_frames_threshold,
|
||||
apm_data_dumper);
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -11,71 +11,36 @@
|
||||
#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
|
||||
#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "absl/types/optional.h"
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
#include "rtc_base/numerics/safe_compare.h"
|
||||
#include <memory>
|
||||
|
||||
namespace webrtc {
|
||||
namespace saturation_protector_impl {
|
||||
class ApmDataDumper;
|
||||
|
||||
// Ring buffer which only supports (i) push back and (ii) read oldest item.
|
||||
class RingBuffer {
|
||||
// Saturation protector. Analyzes peak levels and recommends a headroom to
|
||||
// reduce the chances of clipping.
|
||||
class SaturationProtector {
|
||||
public:
|
||||
bool operator==(const RingBuffer& b) const;
|
||||
inline bool operator!=(const RingBuffer& b) const { return !(*this == b); }
|
||||
virtual ~SaturationProtector() = default;
|
||||
|
||||
// Maximum number of values that the buffer can contain.
|
||||
int Capacity() const { return buffer_.size(); }
|
||||
// Number of values in the buffer.
|
||||
int Size() const { return size_; }
|
||||
// Returns the recommended headroom in dB.
|
||||
virtual float HeadroomDb() = 0;
|
||||
|
||||
void Reset();
|
||||
// Pushes back `v`. If the buffer is full, the oldest value is replaced.
|
||||
void PushBack(float v);
|
||||
// Returns the oldest item in the buffer. Returns an empty value if the
|
||||
// buffer is empty.
|
||||
absl::optional<float> Front() const;
|
||||
// Analyzes the peak level of a 10 ms frame along with its speech probability
|
||||
// and the current speech level estimate to update the recommended headroom.
|
||||
virtual void Analyze(float speech_probability,
|
||||
float peak_dbfs,
|
||||
float speech_level_dbfs) = 0;
|
||||
|
||||
private:
|
||||
inline int FrontIndex() const {
|
||||
return rtc::SafeEq(size_, buffer_.size()) ? next_ : 0;
|
||||
}
|
||||
// `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is
|
||||
// the position where the next new value is written in `buffer_`.
|
||||
std::array<float, kPeakEnveloperBufferSize> buffer_;
|
||||
int next_ = 0;
|
||||
int size_ = 0;
|
||||
// Resets the internal state.
|
||||
virtual void Reset() = 0;
|
||||
};
|
||||
|
||||
} // namespace saturation_protector_impl
|
||||
|
||||
// Saturation protector state. Exposed publicly for check-pointing and restore
|
||||
// ops.
|
||||
struct SaturationProtectorState {
|
||||
bool operator==(const SaturationProtectorState& s) const;
|
||||
inline bool operator!=(const SaturationProtectorState& s) const {
|
||||
return !(*this == s);
|
||||
}
|
||||
|
||||
float margin_db; // Recommended margin.
|
||||
saturation_protector_impl::RingBuffer peak_delay_buffer;
|
||||
float max_peaks_dbfs;
|
||||
int time_since_push_ms; // Time since the last ring buffer push operation.
|
||||
};
|
||||
|
||||
// Resets the saturation protector state.
|
||||
void ResetSaturationProtectorState(float initial_margin_db,
|
||||
SaturationProtectorState& state);
|
||||
|
||||
// Updates `state` by analyzing the estimated speech level `speech_level_dbfs`
|
||||
// and the peak power `speech_peak_dbfs` for an observed frame which is
|
||||
// reliably classified as "speech". `state` must not be modified without calling
|
||||
// this function.
|
||||
void UpdateSaturationProtectorState(float speech_peak_dbfs,
|
||||
float speech_level_dbfs,
|
||||
SaturationProtectorState& state);
|
||||
// Creates a saturation protector that starts at `initial_headroom_db`.
|
||||
std::unique_ptr<SaturationProtector> CreateSaturationProtector(
|
||||
float initial_headroom_db,
|
||||
float extra_headroom_db,
|
||||
int adjacent_speech_frames_threshold,
|
||||
ApmDataDumper* apm_data_dumper);
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
|
||||
77
modules/audio_processing/agc2/saturation_protector_buffer.cc
Normal file
77
modules/audio_processing/agc2/saturation_protector_buffer.cc
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
|
||||
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/numerics/safe_compare.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
// Defaulted constructor, defined out of line. `next_` and `size_` start from
// their in-class initializers (both 0), i.e., the buffer starts empty.
SaturationProtectorBuffer::SaturationProtectorBuffer() = default;
|
||||
|
||||
// Defaulted destructor, defined out of line.
SaturationProtectorBuffer::~SaturationProtectorBuffer() = default;
|
||||
|
||||
bool SaturationProtectorBuffer::operator==(
|
||||
const SaturationProtectorBuffer& b) const {
|
||||
RTC_DCHECK_LE(size_, buffer_.size());
|
||||
RTC_DCHECK_LE(b.size_, b.buffer_.size());
|
||||
if (size_ != b.size_) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_;
|
||||
++i, ++i0, ++i1) {
|
||||
if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int SaturationProtectorBuffer::Capacity() const {
|
||||
return buffer_.size();
|
||||
}
|
||||
|
||||
// Returns the number of values currently stored (`size_`).
int SaturationProtectorBuffer::Size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Empties the buffer by rewinding the element count and the write position.
// The content of the underlying array is left as is.
void SaturationProtectorBuffer::Reset() {
  size_ = 0;
  next_ = 0;
}
|
||||
|
||||
void SaturationProtectorBuffer::PushBack(float v) {
|
||||
RTC_DCHECK_GE(next_, 0);
|
||||
RTC_DCHECK_GE(size_, 0);
|
||||
RTC_DCHECK_LT(next_, buffer_.size());
|
||||
RTC_DCHECK_LE(size_, buffer_.size());
|
||||
buffer_[next_++] = v;
|
||||
if (rtc::SafeEq(next_, buffer_.size())) {
|
||||
next_ = 0;
|
||||
}
|
||||
if (rtc::SafeLt(size_, buffer_.size())) {
|
||||
size_++;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the value stored at `FrontIndex()`, or `absl::nullopt` when the
// buffer holds no values.
absl::optional<float> SaturationProtectorBuffer::Front() const {
  if (size_ != 0) {
    RTC_DCHECK_LT(FrontIndex(), buffer_.size());
    return buffer_[FrontIndex()];
  }
  return absl::nullopt;
}
|
||||
|
||||
int SaturationProtectorBuffer::FrontIndex() const {
|
||||
return rtc::SafeEq(size_, buffer_.size()) ? next_ : 0;
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
59
modules/audio_processing/agc2/saturation_protector_buffer.h
Normal file
59
modules/audio_processing/agc2/saturation_protector_buffer.h
Normal file
@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
|
||||
#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "absl/types/optional.h"
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
// Ring buffer for the saturation protector which only supports (i) push back
|
||||
// and (ii) read oldest item.
|
||||
class SaturationProtectorBuffer {
|
||||
public:
|
||||
SaturationProtectorBuffer();
|
||||
~SaturationProtectorBuffer();
|
||||
|
||||
bool operator==(const SaturationProtectorBuffer& b) const;
|
||||
inline bool operator!=(const SaturationProtectorBuffer& b) const {
|
||||
return !(*this == b);
|
||||
}
|
||||
|
||||
// Maximum number of values that the buffer can contain.
|
||||
int Capacity() const;
|
||||
|
||||
// Number of values in the buffer.
|
||||
int Size() const;
|
||||
|
||||
void Reset();
|
||||
|
||||
// Pushes back `v`. If the buffer is full, the oldest value is replaced.
|
||||
void PushBack(float v);
|
||||
|
||||
// Returns the oldest item in the buffer. Returns an empty value if the
|
||||
// buffer is empty.
|
||||
absl::optional<float> Front() const;
|
||||
|
||||
private:
|
||||
int FrontIndex() const;
|
||||
// `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is
|
||||
// the position where the next new value is written in `buffer_`.
|
||||
std::array<float, kSaturationProtectorBufferSize> buffer_;
|
||||
int next_ = 0;
|
||||
int size_ = 0;
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
|
||||
@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
|
||||
|
||||
#include "test/gmock.h"
|
||||
#include "test/gtest.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
using ::testing::Eq;
|
||||
using ::testing::Optional;
|
||||
|
||||
// Checks that a newly constructed buffer holds no values.
TEST(GainController2SaturationProtectorBuffer, Init) {
  SaturationProtectorBuffer buffer;
  EXPECT_EQ(buffer.Size(), 0);
  EXPECT_FALSE(buffer.Front().has_value());
}
|
||||
|
||||
// Checks that after one push the buffer holds one value and that this value is
// the front.
TEST(GainController2SaturationProtectorBuffer, PushBack) {
  constexpr float kPushedValue = 123.0f;
  SaturationProtectorBuffer buffer;
  buffer.PushBack(kPushedValue);
  EXPECT_EQ(buffer.Size(), 1);
  EXPECT_THAT(buffer.Front(), Optional(Eq(kPushedValue)));
}
|
||||
|
||||
// Checks that `Reset()` leaves a non-empty buffer without any value.
TEST(GainController2SaturationProtectorBuffer, Reset) {
  SaturationProtectorBuffer buffer;
  buffer.PushBack(123.0f);
  buffer.Reset();
  EXPECT_EQ(buffer.Size(), 0);
  EXPECT_FALSE(buffer.Front().has_value());
}
|
||||
|
||||
// Checks that the front value does not change until the ring buffer gets full.
|
||||
TEST(GainController2SaturationProtectorBuffer, FrontUntilBufferIsFull) {
|
||||
SaturationProtectorBuffer b;
|
||||
constexpr float kValue = 123.0f;
|
||||
b.PushBack(kValue);
|
||||
for (int i = 1; i < b.Capacity(); ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
EXPECT_THAT(b.Front(), Optional(Eq(kValue)));
|
||||
b.PushBack(kValue + i);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that when the buffer is full it behaves as a shift register.
|
||||
TEST(GainController2SaturationProtectorBuffer, FrontIsDelayed) {
|
||||
SaturationProtectorBuffer b;
|
||||
// Fill the buffer.
|
||||
for (int i = 0; i < b.Capacity(); ++i) {
|
||||
b.PushBack(i);
|
||||
}
|
||||
// The ring buffer should now behave as a shift register with a delay equal to
|
||||
// its capacity.
|
||||
for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
EXPECT_THAT(b.Front(), Optional(Eq(i - b.Capacity())));
|
||||
b.PushBack(i);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace webrtc
|
||||
@ -10,181 +10,166 @@
|
||||
|
||||
#include "modules/audio_processing/agc2/saturation_protector.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "modules/audio_processing/agc2/agc2_common.h"
|
||||
#include "modules/audio_processing/logging/apm_data_dumper.h"
|
||||
#include "rtc_base/gunit.h"
|
||||
#include "test/gmock.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
constexpr float kInitialMarginDb = 20.f;
|
||||
constexpr float kInitialHeadroomDb = 20.0f;
|
||||
constexpr float kNoExtraHeadroomDb = 0.0f;
|
||||
constexpr int kNoAdjacentSpeechFramesRequired = 1;
|
||||
constexpr float kMaxSpeechProbability = 1.0f;
|
||||
|
||||
using saturation_protector_impl::RingBuffer;
|
||||
|
||||
SaturationProtectorState CreateSaturationProtectorState() {
|
||||
SaturationProtectorState state;
|
||||
ResetSaturationProtectorState(kInitialMarginDb, state);
|
||||
return state;
|
||||
}
|
||||
|
||||
// Updates `state` for `num_iterations` times with constant speech level and
|
||||
// peak powers and returns the maximum margin.
|
||||
// Calls `Analyze(speech_probability, peak_dbfs, speech_level_dbfs)`
|
||||
// `num_iterations` times on `saturation_protector` and returns the largest
|
||||
// headroom difference between two consecutive calls.
|
||||
float RunOnConstantLevel(int num_iterations,
|
||||
float speech_peak_dbfs,
|
||||
float speech_probability,
|
||||
float peak_dbfs,
|
||||
float speech_level_dbfs,
|
||||
SaturationProtectorState& state) {
|
||||
float last_margin = state.margin_db;
|
||||
float max_difference = 0.f;
|
||||
SaturationProtector& saturation_protector) {
|
||||
float last_headroom = saturation_protector.HeadroomDb();
|
||||
float max_difference = 0.0f;
|
||||
for (int i = 0; i < num_iterations; ++i) {
|
||||
UpdateSaturationProtectorState(speech_peak_dbfs, speech_level_dbfs, state);
|
||||
const float new_margin = state.margin_db;
|
||||
saturation_protector.Analyze(speech_probability, peak_dbfs,
|
||||
speech_level_dbfs);
|
||||
const float new_headroom = saturation_protector.HeadroomDb();
|
||||
max_difference =
|
||||
std::max(max_difference, std::abs(new_margin - last_margin));
|
||||
last_margin = new_margin;
|
||||
std::max(max_difference, std::fabs(new_headroom - last_headroom));
|
||||
last_headroom = new_headroom;
|
||||
}
|
||||
return max_difference;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Checks that a newly constructed ring buffer holds no values.
TEST(AutomaticGainController2SaturationProtector, RingBufferInit) {
  RingBuffer ring_buffer;
  EXPECT_EQ(ring_buffer.Size(), 0);
  EXPECT_FALSE(ring_buffer.Front().has_value());
}
|
||||
|
||||
// Checks that after one push the ring buffer holds one value and that this
// value is the front.
TEST(AutomaticGainController2SaturationProtector, RingBufferPushBack) {
  constexpr float kPushedValue = 123.f;
  RingBuffer ring_buffer;
  ring_buffer.PushBack(kPushedValue);
  EXPECT_EQ(ring_buffer.Size(), 1);
  // Stop here if there is no front value: reading it below would be invalid.
  ASSERT_TRUE(ring_buffer.Front().has_value());
  EXPECT_EQ(ring_buffer.Front().value(), kPushedValue);
}
|
||||
|
||||
// Checks that `Reset()` leaves a non-empty ring buffer without any value.
TEST(AutomaticGainController2SaturationProtector, RingBufferReset) {
  RingBuffer ring_buffer;
  ring_buffer.PushBack(123.f);
  ring_buffer.Reset();
  EXPECT_EQ(ring_buffer.Size(), 0);
  EXPECT_FALSE(ring_buffer.Front().has_value());
}
|
||||
|
||||
// Checks that the front value does not change until the ring buffer gets full.
|
||||
TEST(AutomaticGainController2SaturationProtector,
|
||||
RingBufferFrontUntilBufferIsFull) {
|
||||
RingBuffer b;
|
||||
constexpr float kValue = 123.f;
|
||||
b.PushBack(kValue);
|
||||
for (int i = 1; i < b.Capacity(); ++i) {
|
||||
EXPECT_EQ(b.Front().value(), kValue);
|
||||
b.PushBack(kValue + i);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that when the buffer is full it behaves as a shift register.
|
||||
TEST(AutomaticGainController2SaturationProtector,
|
||||
FullRingBufferFrontIsDelayed) {
|
||||
RingBuffer b;
|
||||
// Fill the buffer.
|
||||
for (int i = 0; i < b.Capacity(); ++i) {
|
||||
b.PushBack(i);
|
||||
}
|
||||
// The ring buffer should now behave as a shift register with a delay equal to
|
||||
// its capacity.
|
||||
for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) {
|
||||
EXPECT_EQ(b.Front().value(), i - b.Capacity());
|
||||
b.PushBack(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that a state after reset equals a state after construction.
|
||||
TEST(AutomaticGainController2SaturationProtector, ResetState) {
|
||||
SaturationProtectorState init_state;
|
||||
ResetSaturationProtectorState(kInitialMarginDb, init_state);
|
||||
|
||||
SaturationProtectorState state;
|
||||
ResetSaturationProtectorState(kInitialMarginDb, state);
|
||||
RunOnConstantLevel(/*num_iterations=*/10, /*speech_level_dbfs=*/-20.f,
|
||||
/*speech_peak_dbfs=*/-10.f, state);
|
||||
ASSERT_NE(init_state, state); // Make sure that there are side-effects.
|
||||
ResetSaturationProtectorState(kInitialMarginDb, state);
|
||||
|
||||
EXPECT_EQ(init_state, state);
|
||||
// Checks that the returned headroom value is correctly reset.
|
||||
TEST(GainController2SaturationProtector, Reset) {
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
auto saturation_protector = CreateSaturationProtector(
|
||||
kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
|
||||
&apm_data_dumper);
|
||||
const float initial_headroom_db = saturation_protector->HeadroomDb();
|
||||
RunOnConstantLevel(/*num_iterations=*/10, kMaxSpeechProbability,
|
||||
/*peak_dbfs=*/0.0f,
|
||||
/*speech_level_dbfs=*/-10.0f, *saturation_protector);
|
||||
// Make sure that there are side-effects.
|
||||
ASSERT_NE(initial_headroom_db, saturation_protector->HeadroomDb());
|
||||
saturation_protector->Reset();
|
||||
EXPECT_EQ(initial_headroom_db, saturation_protector->HeadroomDb());
|
||||
}
|
||||
|
||||
// Checks that the estimate converges to the ratio between peaks and level
|
||||
// estimator values after a while.
|
||||
TEST(AutomaticGainController2SaturationProtector,
|
||||
ProtectorEstimatesCrestRatio) {
|
||||
TEST(GainController2SaturationProtector, EstimatesCrestRatio) {
|
||||
constexpr int kNumIterations = 2000;
|
||||
constexpr float kPeakLevel = -20.f;
|
||||
constexpr float kCrestFactor = kInitialMarginDb + 1.f;
|
||||
constexpr float kSpeechLevel = kPeakLevel - kCrestFactor;
|
||||
const float kMaxDifference = 0.5f * std::abs(kInitialMarginDb - kCrestFactor);
|
||||
constexpr float kPeakLevelDbfs = -20.0f;
|
||||
constexpr float kCrestFactorDb = kInitialHeadroomDb + 1.0f;
|
||||
constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb;
|
||||
const float kMaxDifferenceDb =
|
||||
0.5f * std::fabs(kInitialHeadroomDb - kCrestFactorDb);
|
||||
|
||||
auto state = CreateSaturationProtectorState();
|
||||
RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state);
|
||||
|
||||
EXPECT_NEAR(state.margin_db, kCrestFactor, kMaxDifference);
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
auto saturation_protector = CreateSaturationProtector(
|
||||
kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
|
||||
&apm_data_dumper);
|
||||
RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
|
||||
kSpeechLevelDbfs, *saturation_protector);
|
||||
EXPECT_NEAR(saturation_protector->HeadroomDb(), kCrestFactorDb,
|
||||
kMaxDifferenceDb);
|
||||
}
|
||||
|
||||
// Checks that the margin does not change too quickly.
|
||||
TEST(AutomaticGainController2SaturationProtector, ChangeSlowly) {
|
||||
// Checks that the extra headroom is applied.
|
||||
TEST(GainController2SaturationProtector, ExtraHeadroomApplied) {
|
||||
constexpr float kExtraHeadroomDb = 5.1234f;
|
||||
constexpr int kNumIterations = 10;
|
||||
constexpr float kPeakLevelDbfs = -20.0f;
|
||||
constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - 15.0f;
|
||||
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
|
||||
auto saturation_protector_no_extra = CreateSaturationProtector(
|
||||
kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
|
||||
&apm_data_dumper);
|
||||
for (int i = 0; i < kNumIterations; ++i) {
|
||||
saturation_protector_no_extra->Analyze(kMaxSpeechProbability,
|
||||
kPeakLevelDbfs, kSpeechLevelDbfs);
|
||||
}
|
||||
|
||||
auto saturation_protector_extra = CreateSaturationProtector(
|
||||
kInitialHeadroomDb, kExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
|
||||
&apm_data_dumper);
|
||||
for (int i = 0; i < kNumIterations; ++i) {
|
||||
saturation_protector_extra->Analyze(kMaxSpeechProbability, kPeakLevelDbfs,
|
||||
kSpeechLevelDbfs);
|
||||
}
|
||||
|
||||
EXPECT_EQ(saturation_protector_no_extra->HeadroomDb() + kExtraHeadroomDb,
|
||||
saturation_protector_extra->HeadroomDb());
|
||||
}
|
||||
|
||||
// Checks that the headroom does not change too quickly.
|
||||
TEST(GainController2SaturationProtector, ChangeSlowly) {
|
||||
constexpr int kNumIterations = 1000;
|
||||
constexpr float kPeakLevel = -20.f;
|
||||
constexpr float kCrestFactor = kInitialMarginDb - 5.f;
|
||||
constexpr float kOtherCrestFactor = kInitialMarginDb;
|
||||
constexpr float kSpeechLevel = kPeakLevel - kCrestFactor;
|
||||
constexpr float kOtherSpeechLevel = kPeakLevel - kOtherCrestFactor;
|
||||
|
||||
auto state = CreateSaturationProtectorState();
|
||||
float max_difference =
|
||||
RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state);
|
||||
max_difference = std::max(
|
||||
RunOnConstantLevel(kNumIterations, kPeakLevel, kOtherSpeechLevel, state),
|
||||
max_difference);
|
||||
constexpr float kPeakLevelDbfs = -20.f;
|
||||
constexpr float kCrestFactorDb = kInitialHeadroomDb - 5.f;
|
||||
constexpr float kOtherCrestFactorDb = kInitialHeadroomDb;
|
||||
constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb;
|
||||
constexpr float kOtherSpeechLevelDbfs = kPeakLevelDbfs - kOtherCrestFactorDb;
|
||||
|
||||
ApmDataDumper apm_data_dumper(0);
|
||||
auto saturation_protector = CreateSaturationProtector(
|
||||
kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
|
||||
&apm_data_dumper);
|
||||
float max_difference_db =
|
||||
RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
|
||||
kSpeechLevelDbfs, *saturation_protector);
|
||||
max_difference_db = std::max(
|
||||
RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
|
||||
kOtherSpeechLevelDbfs, *saturation_protector),
|
||||
max_difference_db);
|
||||
constexpr float kMaxChangeSpeedDbPerSecond = 0.5f; // 1 db / 2 seconds.
|
||||
EXPECT_LE(max_difference,
|
||||
EXPECT_LE(max_difference_db,
|
||||
kMaxChangeSpeedDbPerSecond / 1000 * kFrameDurationMs);
|
||||
}
|
||||
|
||||
// Checks that there is a delay between input change and margin adaptations.
|
||||
TEST(AutomaticGainController2SaturationProtector, AdaptToDelayedChanges) {
|
||||
constexpr int kDelayIterations = kFullBufferSizeMs / kFrameDurationMs;
|
||||
constexpr float kInitialSpeechLevelDbfs = -30.f;
|
||||
constexpr float kLaterSpeechLevelDbfs = -15.f;
|
||||
class SaturationProtectorParametrization
|
||||
: public ::testing::TestWithParam<int> {
|
||||
protected:
|
||||
int adjacent_speech_frames_threshold() const { return GetParam(); }
|
||||
};
|
||||
|
||||
auto state = CreateSaturationProtectorState();
|
||||
// First run on initial level.
|
||||
float max_difference = RunOnConstantLevel(
|
||||
kDelayIterations, kInitialSpeechLevelDbfs + kInitialMarginDb,
|
||||
kInitialSpeechLevelDbfs, state);
|
||||
// Then peak changes, but not RMS.
|
||||
max_difference =
|
||||
std::max(RunOnConstantLevel(kDelayIterations,
|
||||
kLaterSpeechLevelDbfs + kInitialMarginDb,
|
||||
kInitialSpeechLevelDbfs, state),
|
||||
max_difference);
|
||||
// Then both change.
|
||||
max_difference =
|
||||
std::max(RunOnConstantLevel(kDelayIterations,
|
||||
kLaterSpeechLevelDbfs + kInitialMarginDb,
|
||||
kLaterSpeechLevelDbfs, state),
|
||||
max_difference);
|
||||
|
||||
// The saturation protector expects that the RMS changes roughly
|
||||
// 'kFullBufferSizeMs' after peaks change. This is to account for delay
|
||||
// introduced by the level estimator. Therefore, the input above is 'normal'
|
||||
// and 'expected', and shouldn't influence the margin by much.
|
||||
const float total_difference = std::abs(state.margin_db - kInitialMarginDb);
|
||||
|
||||
EXPECT_LE(total_difference, 0.05f);
|
||||
EXPECT_LE(max_difference, 0.01f);
|
||||
// Checks that the headroom does not change when the number of analyzed frames
// is one less than the adjacent speech frames threshold.
TEST_P(SaturationProtectorParametrization, DoNotAdaptToShortSpeechSegments) {
  constexpr float kPeakDbfs = 0.0f;
  constexpr float kSpeechLevelDbfs = -10.0f;
  const int num_frames = adjacent_speech_frames_threshold() - 1;

  ApmDataDumper data_dumper(0);
  auto protector = CreateSaturationProtector(
      kInitialHeadroomDb, kNoExtraHeadroomDb,
      adjacent_speech_frames_threshold(), &data_dumper);
  const float headroom_at_start_db = protector->HeadroomDb();

  RunOnConstantLevel(num_frames, kMaxSpeechProbability, kPeakDbfs,
                     kSpeechLevelDbfs, *protector);
  // No adaptation expected.
  EXPECT_EQ(headroom_at_start_db, protector->HeadroomDb());
}
|
||||
|
||||
// Checks that the headroom changes when the number of analyzed frames is one
// more than the adjacent speech frames threshold.
TEST_P(SaturationProtectorParametrization, AdaptToEnoughSpeechSegments) {
  constexpr float kPeakDbfs = 0.0f;
  constexpr float kSpeechLevelDbfs = -10.0f;
  const int num_frames = adjacent_speech_frames_threshold() + 1;

  ApmDataDumper data_dumper(0);
  auto protector = CreateSaturationProtector(
      kInitialHeadroomDb, kNoExtraHeadroomDb,
      adjacent_speech_frames_threshold(), &data_dumper);
  const float headroom_at_start_db = protector->HeadroomDb();

  RunOnConstantLevel(num_frames, kMaxSpeechProbability, kPeakDbfs,
                     kSpeechLevelDbfs, *protector);
  // Adaptation expected.
  EXPECT_NE(headroom_at_start_db, protector->HeadroomDb());
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GainController2,
|
||||
SaturationProtectorParametrization,
|
||||
::testing::Values(2, 9, 17));
|
||||
|
||||
} // namespace
|
||||
} // namespace webrtc
|
||||
|
||||
@ -65,43 +65,23 @@ class Vad : public VoiceActivityDetector {
|
||||
rnn_vad::RnnVad rnn_vad_;
|
||||
};
|
||||
|
||||
// Returns an updated version of `p_old` by using instant decay and the given
|
||||
// `attack` on a new VAD probability value `p_new`.
|
||||
float SmoothedVadProbability(float p_old, float p_new, float attack) {
|
||||
RTC_DCHECK_GT(attack, 0.0f);
|
||||
RTC_DCHECK_LE(attack, 1.0f);
|
||||
if (p_new < p_old || attack == 1.0f) {
|
||||
// Instant decay (or no smoothing).
|
||||
return p_new;
|
||||
} else {
|
||||
// Attack phase.
|
||||
return attack * p_new + (1.0f - attack) * p_old;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
VadLevelAnalyzer::VadLevelAnalyzer()
|
||||
: VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs,
|
||||
kDefaultSmoothedVadProbabilityAttack,
|
||||
GetAvailableCpuFeatures()) {}
|
||||
: VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs, GetAvailableCpuFeatures()) {
|
||||
}
|
||||
|
||||
VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
|
||||
float vad_probability_attack,
|
||||
const AvailableCpuFeatures& cpu_features)
|
||||
: VadLevelAnalyzer(vad_reset_period_ms,
|
||||
vad_probability_attack,
|
||||
std::make_unique<Vad>(cpu_features)) {}
|
||||
|
||||
VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
|
||||
float vad_probability_attack,
|
||||
std::unique_ptr<VoiceActivityDetector> vad)
|
||||
: vad_(std::move(vad)),
|
||||
vad_reset_period_frames_(
|
||||
rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)),
|
||||
vad_probability_attack_(vad_probability_attack),
|
||||
time_to_vad_reset_(vad_reset_period_frames_),
|
||||
vad_probability_(0.0f) {
|
||||
time_to_vad_reset_(vad_reset_period_frames_) {
|
||||
RTC_DCHECK(vad_);
|
||||
RTC_DCHECK_GT(vad_reset_period_frames_, 1);
|
||||
}
|
||||
@ -123,11 +103,7 @@ VadLevelAnalyzer::Result VadLevelAnalyzer::AnalyzeFrame(
|
||||
peak = std::max(std::fabs(x), peak);
|
||||
rms += x * x;
|
||||
}
|
||||
// Compute smoothed speech probability.
|
||||
vad_probability_ = SmoothedVadProbability(
|
||||
/*p_old=*/vad_probability_, /*p_new=*/vad_->ComputeProbability(frame),
|
||||
vad_probability_attack_);
|
||||
return {vad_probability_,
|
||||
return {vad_->ComputeProbability(frame),
|
||||
FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())),
|
||||
FloatS16ToDbfs(peak)};
|
||||
}
|
||||
|
||||
@ -37,18 +37,15 @@ class VadLevelAnalyzer {
|
||||
virtual float ComputeProbability(AudioFrameView<const float> frame) = 0;
|
||||
};
|
||||
|
||||
// Ctor. Uses the default VAD.
|
||||
// Ctor. Uses the default VAD with the default settings.
|
||||
VadLevelAnalyzer();
|
||||
// Ctor. `vad_reset_period_ms` indicates the period in milliseconds to call
|
||||
// `VadLevelAnalyzer::Reset()`; it must be equal to or greater than the
|
||||
// duration of two frames. `vad_probability_attack` is a number in (0,1] used
|
||||
// to smooth the speech probability (instant decay, slow attack).
|
||||
// duration of two frames. Uses `cpu_features` to instantiate the default VAD.
|
||||
VadLevelAnalyzer(int vad_reset_period_ms,
|
||||
float vad_probability_attack,
|
||||
const AvailableCpuFeatures& cpu_features);
|
||||
// Ctor. Uses a custom `vad`.
|
||||
VadLevelAnalyzer(int vad_reset_period_ms,
|
||||
float vad_probability_attack,
|
||||
std::unique_ptr<VoiceActivityDetector> vad);
|
||||
|
||||
VadLevelAnalyzer(const VadLevelAnalyzer&) = delete;
|
||||
@ -61,9 +58,7 @@ class VadLevelAnalyzer {
|
||||
private:
|
||||
std::unique_ptr<VoiceActivityDetector> vad_;
|
||||
const int vad_reset_period_frames_;
|
||||
const float vad_probability_attack_;
|
||||
int time_to_vad_reset_;
|
||||
float vad_probability_;
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -29,9 +29,6 @@ using ::testing::ReturnRoundRobin;
|
||||
constexpr int kNoVadPeriodicReset =
|
||||
kFrameDurationMs * (std::numeric_limits<int>::max() / kFrameDurationMs);
|
||||
|
||||
constexpr float kInstantAttack = 1.0f;
|
||||
constexpr float kSlowAttack = 0.1f;
|
||||
|
||||
constexpr int kSampleRateHz = 8000;
|
||||
|
||||
class MockVad : public VadLevelAnalyzer::VoiceActivityDetector {
|
||||
@ -48,7 +45,6 @@ class MockVad : public VadLevelAnalyzer::VoiceActivityDetector {
|
||||
// restart from the beginning.
|
||||
std::unique_ptr<VadLevelAnalyzer> CreateVadLevelAnalyzerWithMockVad(
|
||||
int vad_reset_period_ms,
|
||||
float vad_probability_attack,
|
||||
const std::vector<float>& speech_probabilities,
|
||||
int expected_vad_reset_calls = 0) {
|
||||
auto vad = std::make_unique<MockVad>();
|
||||
@ -58,8 +54,8 @@ std::unique_ptr<VadLevelAnalyzer> CreateVadLevelAnalyzerWithMockVad(
|
||||
if (expected_vad_reset_calls >= 0) {
|
||||
EXPECT_CALL(*vad, Reset).Times(expected_vad_reset_calls);
|
||||
}
|
||||
return std::make_unique<VadLevelAnalyzer>(
|
||||
vad_reset_period_ms, vad_probability_attack, std::move(vad));
|
||||
return std::make_unique<VadLevelAnalyzer>(vad_reset_period_ms,
|
||||
std::move(vad));
|
||||
}
|
||||
|
||||
// 10 ms mono frame.
|
||||
@ -75,7 +71,7 @@ struct FrameWithView {
|
||||
const AudioFrameView<const float> view;
|
||||
};
|
||||
|
||||
TEST(AutomaticGainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) {
|
||||
TEST(GainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) {
|
||||
// Handcrafted frame so that the average is lower than the peak value.
|
||||
FrameWithView frame(1000.0f); // Constant frame.
|
||||
frame.samples[10] = 2000.0f; // Except for one peak value.
|
||||
@ -88,14 +84,13 @@ TEST(AutomaticGainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) {
|
||||
EXPECT_LT(levels_and_vad_prob.rms_dbfs, levels_and_vad_prob.peak_dbfs);
|
||||
}
|
||||
|
||||
// Checks that the unprocessed and the smoothed speech probabilities match when
|
||||
// instant attack is used.
|
||||
TEST(AutomaticGainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) {
|
||||
// Checks that the expected VAD probabilities are returned.
|
||||
TEST(GainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) {
|
||||
const std::vector<float> speech_probabilities{0.709f, 0.484f, 0.882f, 0.167f,
|
||||
0.44f, 0.525f, 0.858f, 0.314f,
|
||||
0.653f, 0.965f, 0.413f, 0.0f};
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
|
||||
kNoVadPeriodicReset, kInstantAttack, speech_probabilities);
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(kNoVadPeriodicReset,
|
||||
speech_probabilities);
|
||||
FrameWithView frame;
|
||||
for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
@ -104,45 +99,11 @@ TEST(AutomaticGainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that the smoothed speech probability does not instantly converge to
|
||||
// the unprocessed one when slow attack is used.
|
||||
TEST(AutomaticGainController2VadLevelAnalyzer,
|
||||
SlowAttackSpeechProbabilitySmoothing) {
|
||||
const std::vector<float> speech_probabilities{0.0f, 0.0f, 1.0f,
|
||||
1.0f, 1.0f, 1.0f};
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
|
||||
kNoVadPeriodicReset, kSlowAttack, speech_probabilities);
|
||||
FrameWithView frame;
|
||||
float prev_probability = 0.0f;
|
||||
for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
const float smoothed_probability =
|
||||
analyzer->AnalyzeFrame(frame.view).speech_probability;
|
||||
EXPECT_LT(smoothed_probability, 1.0f); // Not enough time to reach 1.
|
||||
EXPECT_LE(prev_probability, smoothed_probability); // Converge towards 1.
|
||||
prev_probability = smoothed_probability;
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that the smoothed speech probability instantly decays to the
|
||||
// unprocessed one when slow attack is used.
|
||||
TEST(AutomaticGainController2VadLevelAnalyzer, SpeechProbabilityInstantDecay) {
|
||||
const std::vector<float> speech_probabilities{1.0f, 1.0f, 1.0f,
|
||||
1.0f, 1.0f, 0.0f};
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
|
||||
kNoVadPeriodicReset, kSlowAttack, speech_probabilities);
|
||||
FrameWithView frame;
|
||||
for (int i = 0; rtc::SafeLt(i, speech_probabilities.size() - 1); ++i) {
|
||||
analyzer->AnalyzeFrame(frame.view);
|
||||
}
|
||||
EXPECT_EQ(0.0f, analyzer->AnalyzeFrame(frame.view).speech_probability);
|
||||
}
|
||||
|
||||
// Checks that the VAD is not periodically reset.
|
||||
TEST(AutomaticGainController2VadLevelAnalyzer, VadNoPeriodicReset) {
|
||||
TEST(GainController2VadLevelAnalyzer, VadNoPeriodicReset) {
|
||||
constexpr int kNumFrames = 19;
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
|
||||
kNoVadPeriodicReset, kSlowAttack, /*speech_probabilities=*/{1.0f},
|
||||
kNoVadPeriodicReset, /*speech_probabilities=*/{1.0f},
|
||||
/*expected_vad_reset_calls=*/0);
|
||||
FrameWithView frame;
|
||||
for (int i = 0; i < kNumFrames; ++i) {
|
||||
@ -161,7 +122,7 @@ class VadPeriodResetParametrization
|
||||
TEST_P(VadPeriodResetParametrization, VadPeriodicReset) {
|
||||
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
|
||||
/*vad_reset_period_ms=*/vad_reset_period_frames() * kFrameDurationMs,
|
||||
kSlowAttack, /*speech_probabilities=*/{1.0f},
|
||||
/*speech_probabilities=*/{1.0f},
|
||||
/*expected_vad_reset_calls=*/num_frames() / vad_reset_period_frames());
|
||||
FrameWithView frame;
|
||||
for (int i = 0; i < num_frames(); ++i) {
|
||||
@ -169,7 +130,7 @@ TEST_P(VadPeriodResetParametrization, VadPeriodicReset) {
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AutomaticGainController2VadLevelAnalyzer,
|
||||
INSTANTIATE_TEST_SUITE_P(GainController2VadLevelAnalyzer,
|
||||
VadPeriodResetParametrization,
|
||||
::testing::Combine(::testing::Values(1, 19, 123),
|
||||
::testing::Values(2, 5, 20, 53)));
|
||||
|
||||
@ -73,7 +73,7 @@ void GainController2::Process(AudioBuffer* audio) {
|
||||
|
||||
void GainController2::NotifyAnalogLevel(int level) {
|
||||
if (analog_level_ != level && adaptive_agc_) {
|
||||
adaptive_agc_->Reset();
|
||||
adaptive_agc_->HandleInputGainChange();
|
||||
}
|
||||
analog_level_ = level;
|
||||
}
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
#include "modules/audio_processing/gain_controller2.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
|
||||
#include "api/array_view.h"
|
||||
@ -68,7 +69,8 @@ std::unique_ptr<GainController2> CreateAgc2FixedDigitalMode(
|
||||
return agc2;
|
||||
}
|
||||
|
||||
float GainAfterProcessingFile(GainController2* gain_controller) {
|
||||
float GainDbAfterProcessingFile(GainController2& gain_controller,
|
||||
int max_duration_ms) {
|
||||
// Set up an AudioBuffer to be filled from the speech file.
|
||||
constexpr size_t kStereo = 2u;
|
||||
const StreamConfig capture_config(AudioProcessing::kSampleRate48kHz, kStereo,
|
||||
@ -82,24 +84,29 @@ float GainAfterProcessingFile(GainController2* gain_controller) {
|
||||
std::vector<float> capture_input(capture_config.num_frames() *
|
||||
capture_config.num_channels());
|
||||
|
||||
// The file should contain at least this many frames. Every iteration, we put
|
||||
// a frame through the gain controller.
|
||||
const int kNumFramesToProcess = 100;
|
||||
for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
|
||||
// Process the input file which must be long enough to cover
|
||||
// `max_duration_ms`.
|
||||
RTC_DCHECK_GT(max_duration_ms, 0);
|
||||
const int num_frames = rtc::CheckedDivExact(max_duration_ms, 10);
|
||||
for (int i = 0; i < num_frames; ++i) {
|
||||
ReadFloatSamplesFromStereoFile(capture_config.num_frames(),
|
||||
capture_config.num_channels(), &capture_file,
|
||||
capture_input);
|
||||
|
||||
test::CopyVectorToAudioBuffer(capture_config, capture_input, &ab);
|
||||
gain_controller->Process(&ab);
|
||||
gain_controller.Process(&ab);
|
||||
}
|
||||
|
||||
// Send in a last frame with values constant 1 (It's low enough to detect high
|
||||
// gain, and for ease of computation). The applied gain is the result.
|
||||
// Send in a last frame with minimum dBFS level.
|
||||
constexpr float sample_value = 1.f;
|
||||
SetAudioBufferSamples(sample_value, &ab);
|
||||
gain_controller->Process(&ab);
|
||||
return ab.channels()[0][0];
|
||||
gain_controller.Process(&ab);
|
||||
// Measure the RMS level after processing.
|
||||
float rms = 0.0f;
|
||||
for (size_t i = 0; i < capture_config.num_frames(); ++i) {
|
||||
rms += ab.channels()[0][i] * ab.channels()[0][i];
|
||||
}
|
||||
// Return the applied gain in dB.
|
||||
return 20.0f * std::log10(std::sqrt(rms / capture_config.num_frames()));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -324,34 +331,20 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
48000,
|
||||
true)));
|
||||
|
||||
TEST(GainController2, UsageSaturationMargin) {
|
||||
// Checks that the gain applied at the end of a PCM samples file is close to the
|
||||
// expected value.
|
||||
TEST(GainController2, CheckGainAdaptiveDigital) {
|
||||
constexpr float kExpectedGainDb = 4.3f;
|
||||
constexpr float kToleranceDb = 0.5f;
|
||||
GainController2 gain_controller2;
|
||||
gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz);
|
||||
|
||||
AudioProcessing::Config::GainController2 config;
|
||||
// Check that samples are not amplified as much when extra margin is
|
||||
// high. They should not be amplified at all, but only after convergence. GC2
|
||||
// starts with a gain, and it takes time until it's down to 0 dB.
|
||||
config.fixed_digital.gain_db = 0.f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
config.adaptive_digital.extra_saturation_margin_db = 50.f;
|
||||
gain_controller2.ApplyConfig(config);
|
||||
|
||||
EXPECT_LT(GainAfterProcessingFile(&gain_controller2), 2.f);
|
||||
}
|
||||
|
||||
TEST(GainController2, UsageNoSaturationMargin) {
|
||||
GainController2 gain_controller2;
|
||||
gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz);
|
||||
|
||||
AudioProcessing::Config::GainController2 config;
|
||||
// Check that some gain is applied if there is no margin.
|
||||
config.fixed_digital.gain_db = 0.f;
|
||||
config.adaptive_digital.enabled = true;
|
||||
config.adaptive_digital.extra_saturation_margin_db = 0.f;
|
||||
gain_controller2.ApplyConfig(config);
|
||||
|
||||
EXPECT_GT(GainAfterProcessingFile(&gain_controller2), 1.9f);
|
||||
EXPECT_NEAR(
|
||||
GainDbAfterProcessingFile(gain_controller2, /*max_duration_ms=*/2000),
|
||||
kExpectedGainDb, kToleranceDb);
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
|
||||
@ -46,17 +46,6 @@ std::string GainController1ModeToString(const Agc1Config::Mode& mode) {
|
||||
RTC_CHECK_NOTREACHED();
|
||||
}
|
||||
|
||||
std::string GainController2LevelEstimatorToString(
|
||||
const Agc2Config::LevelEstimator& level) {
|
||||
switch (level) {
|
||||
case Agc2Config::LevelEstimator::kRms:
|
||||
return "Rms";
|
||||
case Agc2Config::LevelEstimator::kPeak:
|
||||
return "Peak";
|
||||
}
|
||||
RTC_CHECK_NOTREACHED();
|
||||
}
|
||||
|
||||
std::string GainController2NoiseEstimatorToString(
|
||||
const Agc2Config::NoiseEstimator& type) {
|
||||
switch (type) {
|
||||
@ -174,20 +163,10 @@ std::string AudioProcessing::Config::ToString() const {
|
||||
<< gain_controller2.adaptive_digital.enabled << ", noise_estimator: "
|
||||
<< GainController2NoiseEstimatorToString(
|
||||
gain_controller2.adaptive_digital.noise_estimator)
|
||||
<< ", level_estimator: { vad_probability_attack: "
|
||||
<< gain_controller2.adaptive_digital.vad_probability_attack << ", type: "
|
||||
<< GainController2LevelEstimatorToString(
|
||||
gain_controller2.adaptive_digital.level_estimator)
|
||||
<< ", vad_reset_period_ms: "
|
||||
<< gain_controller2.adaptive_digital.vad_reset_period_ms
|
||||
<< ", adjacent_speech_frames_threshold: "
|
||||
<< gain_controller2.adaptive_digital
|
||||
.level_estimator_adjacent_speech_frames_threshold
|
||||
<< ", initial_saturation_margin_db: "
|
||||
<< gain_controller2.adaptive_digital.initial_saturation_margin_db
|
||||
<< ", extra_saturation_margin_db: "
|
||||
<< gain_controller2.adaptive_digital.extra_saturation_margin_db
|
||||
<< " }, gain_applier: { adjacent_speech_frames_threshold: "
|
||||
<< gain_controller2.adaptive_digital
|
||||
.gain_applier_adjacent_speech_frames_threshold
|
||||
<< gain_controller2.adaptive_digital.adjacent_speech_frames_threshold
|
||||
<< ", max_gain_change_db_per_second: "
|
||||
<< gain_controller2.adaptive_digital.max_gain_change_db_per_second
|
||||
<< ", max_output_noise_level_dbfs: "
|
||||
@ -195,7 +174,7 @@ std::string AudioProcessing::Config::ToString() const {
|
||||
<< ", sse2_allowed: " << gain_controller2.adaptive_digital.sse2_allowed
|
||||
<< ", avx2_allowed: " << gain_controller2.adaptive_digital.avx2_allowed
|
||||
<< ", neon_allowed: " << gain_controller2.adaptive_digital.neon_allowed
|
||||
<< " }}}, residual_echo_detector: { enabled: "
|
||||
<< "}}, residual_echo_detector: { enabled: "
|
||||
<< residual_echo_detector.enabled
|
||||
<< " }, level_estimation: { enabled: " << level_estimation.enabled
|
||||
<< " }}";
|
||||
|
||||
@ -349,6 +349,7 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
|
||||
return !(*this == rhs);
|
||||
}
|
||||
|
||||
// TODO(crbug.com/webrtc/7494): Remove `LevelEstimator`.
|
||||
enum LevelEstimator { kRms, kPeak };
|
||||
enum NoiseEstimator { kStationaryNoise, kNoiseFloor };
|
||||
bool enabled = false;
|
||||
@ -359,19 +360,20 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
|
||||
bool enabled = false;
|
||||
NoiseEstimator noise_estimator = kNoiseFloor;
|
||||
int vad_reset_period_ms = 1500;
|
||||
float vad_probability_attack = 0.9f;
|
||||
LevelEstimator level_estimator = kRms;
|
||||
int level_estimator_adjacent_speech_frames_threshold = 11;
|
||||
// TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`.
|
||||
bool use_saturation_protector = true;
|
||||
float initial_saturation_margin_db = 20.0f;
|
||||
float extra_saturation_margin_db = 5.0f;
|
||||
int gain_applier_adjacent_speech_frames_threshold = 11;
|
||||
int adjacent_speech_frames_threshold = 12;
|
||||
float max_gain_change_db_per_second = 3.0f;
|
||||
float max_output_noise_level_dbfs = -55.0f;
|
||||
float max_output_noise_level_dbfs = -50.0f;
|
||||
bool sse2_allowed = true;
|
||||
bool avx2_allowed = true;
|
||||
bool neon_allowed = true;
|
||||
// TODO(crbug.com/webrtc/7494): Remove deprecated settings below.
|
||||
float vad_probability_attack = 1.0f;
|
||||
LevelEstimator level_estimator = kRms;
|
||||
int level_estimator_adjacent_speech_frames_threshold = 12;
|
||||
bool use_saturation_protector = true;
|
||||
float initial_saturation_margin_db = 25.0f;
|
||||
float extra_saturation_margin_db = 5.0f;
|
||||
int gain_applier_adjacent_speech_frames_threshold = 12;
|
||||
} adaptive_digital;
|
||||
} gain_controller2;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user