Use VAD to get a better speech power estimation in the IntelligibilityEnhancer
R=henrik.lundin@webrtc.org, turaj@webrtc.org Review URL: https://codereview.webrtc.org/1693823004 . Cr-Commit-Position: refs/heads/master@{#11713}
This commit is contained in:
@ -1184,8 +1184,7 @@ bool AudioProcessingImpl::analysis_needed(bool is_data_processed) const {
|
||||
}
|
||||
|
||||
bool AudioProcessingImpl::is_rev_processed() const {
|
||||
return constants_.intelligibility_enabled &&
|
||||
public_submodules_->intelligibility_enhancer->active();
|
||||
return constants_.intelligibility_enabled;
|
||||
}
|
||||
|
||||
bool AudioProcessingImpl::render_check_rev_conversion_needed() const {
|
||||
@ -1236,12 +1235,9 @@ void AudioProcessingImpl::InitializeBeamformer() {
|
||||
|
||||
void AudioProcessingImpl::InitializeIntelligibility() {
|
||||
if (constants_.intelligibility_enabled) {
|
||||
IntelligibilityEnhancer::Config config;
|
||||
config.sample_rate_hz = capture_nonlocked_.split_rate;
|
||||
config.num_capture_channels = capture_.capture_audio->num_channels();
|
||||
config.num_render_channels = render_.render_audio->num_channels();
|
||||
public_submodules_->intelligibility_enhancer.reset(
|
||||
new IntelligibilityEnhancer(config));
|
||||
new IntelligibilityEnhancer(capture_nonlocked_.split_rate,
|
||||
render_.render_audio->num_channels()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -27,11 +27,16 @@ namespace {
|
||||
const size_t kErbResolution = 2;
|
||||
const int kWindowSizeMs = 16;
|
||||
const int kChunkSizeMs = 10; // Size provided by APM.
|
||||
const float kClipFreq = 200.0f;
|
||||
const float kConfigRho = 0.02f; // Default production and interpretation SNR.
|
||||
const float kClipFreqKhz = 0.2f;
|
||||
const float kKbdAlpha = 1.5f;
|
||||
const float kLambdaBot = -1.0f; // Extreme values in bisection
|
||||
const float kLambdaTop = -10e-18f; // search for lambda.
|
||||
const float kVoiceProbabilityThreshold = 0.02f;
|
||||
// Number of chunks after voice activity that are still considered speech.
|
||||
const size_t kSpeechOffsetDelay = 80;
|
||||
const float kDecayRate = 0.98f; // Power estimation decay rate.
|
||||
const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.
|
||||
const float kRho = 0.0004f; // Default production and interpretation SNR.
|
||||
|
||||
// Returns dot product of vectors |a| and |b| with size |length|.
|
||||
float DotProduct(const float* a, const float* b, size_t length) {
|
||||
@ -72,61 +77,46 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
|
||||
}
|
||||
}
|
||||
|
||||
IntelligibilityEnhancer::IntelligibilityEnhancer()
|
||||
: IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {
|
||||
}
|
||||
|
||||
IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
|
||||
IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
|
||||
size_t num_render_channels)
|
||||
: freqs_(RealFourier::ComplexLength(
|
||||
RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),
|
||||
window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),
|
||||
chunk_length_(
|
||||
static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)),
|
||||
bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),
|
||||
sample_rate_hz_(config.sample_rate_hz),
|
||||
erb_resolution_(kErbResolution),
|
||||
num_capture_channels_(config.num_capture_channels),
|
||||
num_render_channels_(config.num_render_channels),
|
||||
analysis_rate_(config.analysis_rate),
|
||||
active_(true),
|
||||
clear_power_(freqs_, config.decay_rate),
|
||||
noise_power_(freqs_, 0.f),
|
||||
RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
|
||||
chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
|
||||
bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
|
||||
sample_rate_hz_(sample_rate_hz),
|
||||
num_render_channels_(num_render_channels),
|
||||
clear_power_estimator_(freqs_, kDecayRate),
|
||||
noise_power_estimator_(
|
||||
new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
|
||||
filtered_clear_pow_(new float[bank_size_]),
|
||||
filtered_noise_pow_(new float[bank_size_]),
|
||||
center_freqs_(new float[bank_size_]),
|
||||
render_filter_bank_(CreateErbBank(freqs_)),
|
||||
rho_(new float[bank_size_]),
|
||||
gains_eq_(new float[bank_size_]),
|
||||
gain_applier_(freqs_, config.gain_change_limit),
|
||||
gain_applier_(freqs_, kMaxRelativeGainChange),
|
||||
temp_render_out_buffer_(chunk_length_, num_render_channels_),
|
||||
kbd_window_(new float[window_size_]),
|
||||
render_callback_(this),
|
||||
block_count_(0),
|
||||
analysis_step_(0) {
|
||||
RTC_DCHECK_LE(config.rho, 1.0f);
|
||||
audio_s16_(chunk_length_),
|
||||
chunks_since_voice_(kSpeechOffsetDelay),
|
||||
is_speech_(false) {
|
||||
RTC_DCHECK_LE(kRho, 1.f);
|
||||
|
||||
memset(filtered_clear_pow_.get(),
|
||||
0,
|
||||
memset(filtered_clear_pow_.get(), 0,
|
||||
bank_size_ * sizeof(filtered_clear_pow_[0]));
|
||||
memset(filtered_noise_pow_.get(),
|
||||
0,
|
||||
memset(filtered_noise_pow_.get(), 0,
|
||||
bank_size_ * sizeof(filtered_noise_pow_[0]));
|
||||
|
||||
// Assumes all rho equal.
|
||||
for (size_t i = 0; i < bank_size_; ++i) {
|
||||
rho_[i] = config.rho * config.rho;
|
||||
}
|
||||
const size_t erb_index = static_cast<size_t>(
|
||||
ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
|
||||
43.f));
|
||||
start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
|
||||
|
||||
float freqs_khz = kClipFreq / 1000.0f;
|
||||
size_t erb_index = static_cast<size_t>(ceilf(
|
||||
11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));
|
||||
start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);
|
||||
|
||||
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
|
||||
kbd_window_.get());
|
||||
size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
|
||||
std::vector<float> kbd_window(window_size);
|
||||
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
|
||||
render_mangler_.reset(new LappedTransform(
|
||||
num_render_channels_, num_render_channels_, chunk_length_,
|
||||
kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));
|
||||
num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
|
||||
window_size, window_size / 2, &render_callback_));
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
|
||||
@ -134,13 +124,10 @@ void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
|
||||
if (capture_filter_bank_.size() != bank_size_ ||
|
||||
capture_filter_bank_[0].size() != noise.size()) {
|
||||
capture_filter_bank_ = CreateErbBank(noise.size());
|
||||
noise_power_estimator_.reset(
|
||||
new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
|
||||
}
|
||||
if (noise.size() != noise_power_.size()) {
|
||||
noise_power_.resize(noise.size());
|
||||
}
|
||||
for (size_t i = 0; i < noise.size(); ++i) {
|
||||
noise_power_[i] = noise[i] * noise[i];
|
||||
}
|
||||
noise_power_estimator_->Step(&noise[0]);
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
|
||||
@ -148,54 +135,29 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
|
||||
size_t num_channels) {
|
||||
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
|
||||
RTC_CHECK_EQ(num_render_channels_, num_channels);
|
||||
|
||||
if (active_) {
|
||||
is_speech_ = IsSpeech(audio[0]);
|
||||
render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
|
||||
}
|
||||
|
||||
if (active_) {
|
||||
for (size_t i = 0; i < num_render_channels_; ++i) {
|
||||
memcpy(audio[i], temp_render_out_buffer_.channels()[i],
|
||||
chunk_length_ * sizeof(**audio));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::ProcessClearBlock(
|
||||
const std::complex<float>* in_block,
|
||||
std::complex<float>* out_block) {
|
||||
if (block_count_ < 2) {
|
||||
memset(out_block, 0, freqs_ * sizeof(*out_block));
|
||||
++block_count_;
|
||||
return;
|
||||
if (is_speech_) {
|
||||
clear_power_estimator_.Step(in_block);
|
||||
}
|
||||
|
||||
// TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.
|
||||
if (true) {
|
||||
clear_power_.Step(in_block);
|
||||
if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
|
||||
AnalyzeClearBlock();
|
||||
++analysis_step_;
|
||||
}
|
||||
++block_count_;
|
||||
}
|
||||
|
||||
if (active_) {
|
||||
gain_applier_.Apply(in_block, out_block);
|
||||
}
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::AnalyzeClearBlock() {
|
||||
const float* clear_power = clear_power_.Power();
|
||||
MapToErbBands(clear_power,
|
||||
render_filter_bank_,
|
||||
const std::vector<float>& clear_power = clear_power_estimator_.power();
|
||||
const std::vector<float>& noise_power = noise_power_estimator_->power();
|
||||
MapToErbBands(&clear_power[0], render_filter_bank_,
|
||||
filtered_clear_pow_.get());
|
||||
MapToErbBands(&noise_power_[0],
|
||||
capture_filter_bank_,
|
||||
MapToErbBands(&noise_power[0], capture_filter_bank_,
|
||||
filtered_noise_pow_.get());
|
||||
SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
|
||||
const float power_target = std::accumulate(
|
||||
clear_power, clear_power + freqs_, 0.f);
|
||||
const float power_target =
|
||||
std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
|
||||
const float power_top =
|
||||
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
|
||||
SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
|
||||
@ -205,6 +167,7 @@ void IntelligibilityEnhancer::AnalyzeClearBlock() {
|
||||
SolveForLambda(power_target, power_bot, power_top);
|
||||
UpdateErbGains();
|
||||
} // Else experiencing power underflow, so do nothing.
|
||||
gain_applier_.Apply(in_block, out_block);
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::SolveForLambda(float power_target,
|
||||
@ -217,11 +180,10 @@ void IntelligibilityEnhancer::SolveForLambda(float power_target,
|
||||
1.f / (power_target + std::numeric_limits<float>::epsilon());
|
||||
float lambda_bot = kLambdaBot;
|
||||
float lambda_top = kLambdaTop;
|
||||
float power_ratio = 2.0f; // Ratio of achieved power to target power.
|
||||
float power_ratio = 2.f; // Ratio of achieved power to target power.
|
||||
int iters = 0;
|
||||
while (std::fabs(power_ratio - 1.0f) > kConvergeThresh &&
|
||||
iters <= kMaxIters) {
|
||||
const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;
|
||||
while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
|
||||
const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;
|
||||
SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
|
||||
const float power =
|
||||
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
|
||||
@ -239,7 +201,7 @@ void IntelligibilityEnhancer::UpdateErbGains() {
|
||||
// (ERB gain) = filterbank' * (freq gain)
|
||||
float* gains = gain_applier_.target();
|
||||
for (size_t i = 0; i < freqs_; ++i) {
|
||||
gains[i] = 0.0f;
|
||||
gains[i] = 0.f;
|
||||
for (size_t j = 0; j < bank_size_; ++j) {
|
||||
gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);
|
||||
}
|
||||
@ -248,9 +210,9 @@ void IntelligibilityEnhancer::UpdateErbGains() {
|
||||
|
||||
// Returns the number of ERB filters to use for |sample_rate|: the value of
// the ERB-scale expression below evaluated at |sample_rate| / 2000 (half the
// sample rate, in kHz when |sample_rate| is in Hz), rounded up, multiplied by
// |erb_resolution|.
// NOTE: this listing shows both spellings of the float literals
// (2000.0f / 2000.f and 43.0f / 43.f); they denote the same values.
size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,
|
||||
size_t erb_resolution) {
|
||||
float freq_limit = sample_rate / 2000.0f;
|
||||
float freq_limit = sample_rate / 2000.f;
|
||||
// Round up so the band containing |freq_limit| is included.
size_t erb_scale = static_cast<size_t>(ceilf(
|
||||
11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));
|
||||
11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));
|
||||
return erb_scale * erb_resolution;
|
||||
}
|
||||
|
||||
@ -260,7 +222,7 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
|
||||
size_t lf = 1, rf = 4;
|
||||
|
||||
for (size_t i = 0; i < bank_size_; ++i) {
|
||||
float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));
|
||||
float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));
|
||||
center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
|
||||
center_freqs_[i] -= 14678.49f;
|
||||
}
|
||||
@ -274,48 +236,43 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
|
||||
}
|
||||
|
||||
for (size_t i = 1; i <= bank_size_; ++i) {
|
||||
size_t lll, ll, rr, rrr;
|
||||
static const size_t kOne = 1; // Avoids repeated static_cast<>s below.
|
||||
lll = static_cast<size_t>(round(
|
||||
center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs /
|
||||
(0.5f * sample_rate_hz_)));
|
||||
ll = static_cast<size_t>(round(
|
||||
center_freqs_[std::max(kOne, i) - 1] * num_freqs /
|
||||
(0.5f * sample_rate_hz_)));
|
||||
size_t lll =
|
||||
static_cast<size_t>(round(center_freqs_[std::max(kOne, i - lf) - 1] *
|
||||
num_freqs / (0.5f * sample_rate_hz_)));
|
||||
size_t ll = static_cast<size_t>(round(center_freqs_[std::max(kOne, i) - 1] *
|
||||
num_freqs / (0.5f * sample_rate_hz_)));
|
||||
lll = std::min(num_freqs, std::max(lll, kOne)) - 1;
|
||||
ll = std::min(num_freqs, std::max(ll, kOne)) - 1;
|
||||
|
||||
rrr = static_cast<size_t>(round(
|
||||
center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
|
||||
size_t rrr = static_cast<size_t>(
|
||||
round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
|
||||
(0.5f * sample_rate_hz_)));
|
||||
rr = static_cast<size_t>(round(
|
||||
center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
|
||||
size_t rr = static_cast<size_t>(
|
||||
round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
|
||||
(0.5f * sample_rate_hz_)));
|
||||
rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1;
|
||||
rr = std::min(num_freqs, std::max(rr, kOne)) - 1;
|
||||
|
||||
float step, element;
|
||||
|
||||
step = ll == lll ? 0.f : 1.f / (ll - lll);
|
||||
element = 0.0f;
|
||||
float step = ll == lll ? 0.f : 1.f / (ll - lll);
|
||||
float element = 0.f;
|
||||
for (size_t j = lll; j <= ll; ++j) {
|
||||
filter_bank[i - 1][j] = element;
|
||||
element += step;
|
||||
}
|
||||
step = rr == rrr ? 0.f : 1.f / (rrr - rr);
|
||||
element = 1.0f;
|
||||
element = 1.f;
|
||||
for (size_t j = rr; j <= rrr; ++j) {
|
||||
filter_bank[i - 1][j] = element;
|
||||
element -= step;
|
||||
}
|
||||
for (size_t j = ll; j <= rr; ++j) {
|
||||
filter_bank[i - 1][j] = 1.0f;
|
||||
filter_bank[i - 1][j] = 1.f;
|
||||
}
|
||||
}
|
||||
|
||||
float sum;
|
||||
for (size_t i = 0; i < num_freqs; ++i) {
|
||||
sum = 0.0f;
|
||||
float sum = 0.f;
|
||||
for (size_t j = 0; j < bank_size_; ++j) {
|
||||
sum += filter_bank[j][i];
|
||||
}
|
||||
@ -329,22 +286,22 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
|
||||
void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
|
||||
size_t start_freq,
|
||||
float* sols) {
|
||||
bool quadratic = (kConfigRho < 1.0f);
|
||||
bool quadratic = (kRho < 1.f);
|
||||
const float* pow_x0 = filtered_clear_pow_.get();
|
||||
const float* pow_n0 = filtered_noise_pow_.get();
|
||||
|
||||
for (size_t n = 0; n < start_freq; ++n) {
|
||||
sols[n] = 1.0f;
|
||||
sols[n] = 1.f;
|
||||
}
|
||||
|
||||
// Analytic solution for optimal gains. See paper for derivation.
|
||||
for (size_t n = start_freq - 1; n < bank_size_; ++n) {
|
||||
float alpha0, beta0, gamma0;
|
||||
gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] +
|
||||
gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
|
||||
lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
|
||||
beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n];
|
||||
beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];
|
||||
if (quadratic) {
|
||||
alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n];
|
||||
alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];
|
||||
sols[n] =
|
||||
(-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /
|
||||
(2 * alpha0 + std::numeric_limits<float>::epsilon());
|
||||
@ -355,8 +312,15 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
|
||||
}
|
||||
}
|
||||
|
||||
bool IntelligibilityEnhancer::active() const {
|
||||
return active_;
|
||||
// Feeds one chunk of |audio| (|chunk_length_| samples) to the VAD and returns
// true while fewer than |kSpeechOffsetDelay| chunks have passed since the VAD
// last reported a voice probability above |kVoiceProbabilityThreshold|.
bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
|
||||
// The VAD is given 16-bit samples, so convert into the scratch buffer first.
FloatToS16(audio, chunk_length_, &audio_s16_[0]);
|
||||
vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
|
||||
if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
|
||||
// Voice detected in this chunk: restart the hangover counter.
chunks_since_voice_ = 0;
|
||||
} else if (chunks_since_voice_ < kSpeechOffsetDelay) {
|
||||
// No voice: count the chunk, saturating at |kSpeechOffsetDelay|.
++chunks_since_voice_;
|
||||
}
|
||||
return chunks_since_voice_ < kSpeechOffsetDelay;
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include "webrtc/common_audio/lapped_transform.h"
|
||||
#include "webrtc/common_audio/channel_buffer.h"
|
||||
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
|
||||
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
@ -28,28 +29,7 @@ namespace webrtc {
|
||||
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
|
||||
class IntelligibilityEnhancer {
|
||||
public:
|
||||
struct Config {
|
||||
// TODO(bercic): the |decay_rate|, |analysis_rate| and |gain_change_limit|
|
||||
// parameters should probably go away once fine tuning is done.
|
||||
Config()
|
||||
: sample_rate_hz(16000),
|
||||
num_capture_channels(1),
|
||||
num_render_channels(1),
|
||||
decay_rate(0.9f),
|
||||
analysis_rate(60),
|
||||
gain_change_limit(0.1f),
|
||||
rho(0.02f) {}
|
||||
int sample_rate_hz;
|
||||
size_t num_capture_channels;
|
||||
size_t num_render_channels;
|
||||
float decay_rate;
|
||||
int analysis_rate;
|
||||
float gain_change_limit;
|
||||
float rho;
|
||||
};
|
||||
|
||||
explicit IntelligibilityEnhancer(const Config& config);
|
||||
IntelligibilityEnhancer(); // Initialize with default config.
|
||||
IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
|
||||
|
||||
// Sets the capture noise magnitude spectrum estimate.
|
||||
void SetCaptureNoiseEstimate(std::vector<float> noise);
|
||||
@ -86,9 +66,6 @@ class IntelligibilityEnhancer {
|
||||
void ProcessClearBlock(const std::complex<float>* in_block,
|
||||
std::complex<float>* out_block);
|
||||
|
||||
// Computes and sets modified gains.
|
||||
void AnalyzeClearBlock();
|
||||
|
||||
// Bisection search for optimal |lambda|.
|
||||
void SolveForLambda(float power_target, float power_bot, float power_top);
|
||||
|
||||
@ -105,29 +82,25 @@ class IntelligibilityEnhancer {
|
||||
// Negative gains are set to 0. Stores the results in |sols|.
|
||||
void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);
|
||||
|
||||
// Returns true if the audio is speech.
|
||||
bool IsSpeech(const float* audio);
|
||||
|
||||
const size_t freqs_; // Num frequencies in frequency domain.
|
||||
const size_t window_size_; // Window size in samples; also the block size.
|
||||
const size_t chunk_length_; // Chunk size in samples.
|
||||
const size_t bank_size_; // Num ERB filters.
|
||||
const int sample_rate_hz_;
|
||||
const int erb_resolution_;
|
||||
const size_t num_capture_channels_;
|
||||
const size_t num_render_channels_;
|
||||
const int analysis_rate_; // Num blocks before gains recalculated.
|
||||
|
||||
const bool active_; // Whether render gains are being updated.
|
||||
// TODO(ekm): Add logic for updating |active_|.
|
||||
|
||||
intelligibility::PowerEstimator clear_power_;
|
||||
std::vector<float> noise_power_;
|
||||
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
|
||||
std::unique_ptr<intelligibility::PowerEstimator<float>>
|
||||
noise_power_estimator_;
|
||||
std::unique_ptr<float[]> filtered_clear_pow_;
|
||||
std::unique_ptr<float[]> filtered_noise_pow_;
|
||||
std::unique_ptr<float[]> center_freqs_;
|
||||
std::vector<std::vector<float>> capture_filter_bank_;
|
||||
std::vector<std::vector<float>> render_filter_bank_;
|
||||
size_t start_freq_;
|
||||
std::unique_ptr<float[]> rho_; // Production and interpretation SNR.
|
||||
// for each ERB band.
|
||||
|
||||
std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.
|
||||
intelligibility::GainApplier gain_applier_;
|
||||
|
||||
@ -135,11 +108,13 @@ class IntelligibilityEnhancer {
|
||||
// the original input array with modifications.
|
||||
ChannelBuffer<float> temp_render_out_buffer_;
|
||||
|
||||
std::unique_ptr<float[]> kbd_window_;
|
||||
TransformCallback render_callback_;
|
||||
std::unique_ptr<LappedTransform> render_mangler_;
|
||||
int block_count_;
|
||||
int analysis_step_;
|
||||
|
||||
VoiceActivityDetector vad_;
|
||||
std::vector<int16_t> audio_s16_;
|
||||
size_t chunks_since_voice_;
|
||||
bool is_speech_;
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -26,54 +26,184 @@ namespace {
|
||||
|
||||
// Target output for ERB create test. Generated with matlab.
|
||||
const float kTestCenterFreqs[] = {
|
||||
13.169f, 26.965f, 41.423f, 56.577f, 72.461f, 89.113f, 106.57f, 124.88f,
|
||||
144.08f, 164.21f, 185.34f, 207.5f, 230.75f, 255.16f, 280.77f, 307.66f,
|
||||
335.9f, 365.56f, 396.71f, 429.44f, 463.84f, 500.f};
|
||||
const float kTestFilterBank[][9] = {
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.5f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.5f}};
|
||||
14.5213f, 29.735f, 45.6781f, 62.3884f, 79.9058f, 98.2691f, 117.521f,
|
||||
137.708f, 158.879f, 181.084f, 204.378f, 228.816f, 254.459f, 281.371f,
|
||||
309.618f, 339.273f, 370.411f, 403.115f, 437.469f, 473.564f, 511.497f,
|
||||
551.371f, 593.293f, 637.386f, 683.77f, 732.581f, 783.96f, 838.06f,
|
||||
895.046f, 955.09f, 1018.38f, 1085.13f, 1155.54f, 1229.85f, 1308.32f,
|
||||
1391.22f, 1478.83f, 1571.5f, 1669.55f, 1773.37f, 1883.37f, 2000.f};
|
||||
const float kTestFilterBank[][33] = {
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.157895f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.210526f, 0.117647f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.315789f, 0.176471f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.315789f, 0.352941f, 0.142857f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.352941f, 0.285714f,
|
||||
0.157895f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f,
|
||||
0.210526f, 0.111111f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.285714f, 0.315789f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.315789f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f,
|
||||
0.108108f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f,
|
||||
0.243243f, 0.153846f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f,
|
||||
0.324324f, 0.230769f, 0.166667f, 0.0909091f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.324324f, 0.307692f, 0.25f, 0.181818f, 0.0833333f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.307692f, 0.333333f,
|
||||
0.363636f, 0.25f, 0.151515f, 0.0793651f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.166667f, 0.363636f, 0.333333f, 0.242424f,
|
||||
0.190476f, 0.133333f, 0.0689655f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.333333f, 0.30303f, 0.253968f, 0.2f, 0.137931f,
|
||||
0.0714286f, 0.f, 0.f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.30303f, 0.31746f, 0.333333f, 0.275862f, 0.214286f,
|
||||
0.125f, 0.0655738f, 0.f, 0.f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.15873f, 0.333333f, 0.344828f, 0.357143f,
|
||||
0.25f, 0.196721f, 0.137931f, 0.0816327f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.172414f, 0.357143f,
|
||||
0.3125f, 0.245902f, 0.172414f, 0.102041f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.3125f, 0.327869f, 0.344828f, 0.204082f, 0.f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.163934f, 0.344828f, 0.408163f, 0.5f},
|
||||
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.204082f, 0.5f}};
|
||||
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestFilterBank),
|
||||
"Test filterbank badly initialized.");
|
||||
|
||||
// Target output for gain solving test. Generated with matlab.
|
||||
const size_t kTestStartFreq = 12; // Lowest integral frequency for ERBs.
|
||||
const float kTestZeroVar[] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
|
||||
1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
|
||||
const float kTestZeroVar[] = {
|
||||
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
|
||||
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestZeroVar),
|
||||
"Power test data badly initialized.");
|
||||
const float kTestNonZeroVarLambdaTop[] = {
|
||||
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
|
||||
1.f, 1.f, 1.f, 0.f, 0.f, 0.0351f, 0.0636f, 0.0863f,
|
||||
0.1037f, 0.1162f, 0.1236f, 0.1251f, 0.1189f, 0.0993f};
|
||||
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
|
||||
static_assert(arraysize(kTestCenterFreqs) ==
|
||||
arraysize(kTestNonZeroVarLambdaTop),
|
||||
"Power test data badly initialized.");
|
||||
const float kMaxTestError = 0.005f;
|
||||
|
||||
// Enhancer initialization parameters.
|
||||
const int kSamples = 2000;
|
||||
const int kSampleRate = 1000;
|
||||
const int kSamples = 1000;
|
||||
const int kSampleRate = 4000;
|
||||
const int kNumChannels = 1;
|
||||
const int kFragmentSize = kSampleRate / 100;
|
||||
|
||||
@ -83,13 +213,11 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
|
||||
protected:
|
||||
IntelligibilityEnhancerTest()
|
||||
: clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {
|
||||
config_.sample_rate_hz = kSampleRate;
|
||||
enh_.reset(new IntelligibilityEnhancer(config_));
|
||||
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
|
||||
}
|
||||
|
||||
bool CheckUpdate() {
|
||||
config_.sample_rate_hz = kSampleRate;
|
||||
enh_.reset(new IntelligibilityEnhancer(config_));
|
||||
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
|
||||
float* clear_cursor = &clear_data_[0];
|
||||
float* noise_cursor = &noise_data_[0];
|
||||
for (int i = 0; i < kSamples; i += kFragmentSize) {
|
||||
@ -105,7 +233,6 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
|
||||
return false;
|
||||
}
|
||||
|
||||
IntelligibilityEnhancer::Config config_;
|
||||
std::unique_ptr<IntelligibilityEnhancer> enh_;
|
||||
std::vector<float> clear_data_;
|
||||
std::vector<float> noise_data_;
|
||||
@ -115,9 +242,9 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
|
||||
// For each class of generated data, tests that render stream is updated when
|
||||
// it should be.
|
||||
TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {
|
||||
std::fill(noise_data_.begin(), noise_data_.end(), 0.0f);
|
||||
std::fill(orig_data_.begin(), orig_data_.end(), 0.0f);
|
||||
std::fill(clear_data_.begin(), clear_data_.end(), 0.0f);
|
||||
std::fill(noise_data_.begin(), noise_data_.end(), 0.f);
|
||||
std::fill(orig_data_.begin(), orig_data_.end(), 0.f);
|
||||
std::fill(clear_data_.begin(), clear_data_.end(), 0.f);
|
||||
EXPECT_FALSE(CheckUpdate());
|
||||
std::srand(1);
|
||||
auto float_rand = []() { return std::rand() * 2.f / RAND_MAX - 1; };
|
||||
@ -148,9 +275,8 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
|
||||
std::vector<float> sols(enh_->bank_size_);
|
||||
float lambda = -0.001f;
|
||||
for (size_t i = 0; i < enh_->bank_size_; i++) {
|
||||
enh_->filtered_clear_pow_[i] = 0.0f;
|
||||
enh_->filtered_noise_pow_[i] = 0.0f;
|
||||
enh_->rho_[i] = 0.02f;
|
||||
enh_->filtered_clear_pow_[i] = 0.f;
|
||||
enh_->filtered_noise_pow_[i] = 0.f;
|
||||
}
|
||||
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
|
||||
for (size_t i = 0; i < enh_->bank_size_; i++) {
|
||||
@ -164,7 +290,7 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
|
||||
for (size_t i = 0; i < enh_->bank_size_; i++) {
|
||||
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
|
||||
}
|
||||
lambda = -1.0;
|
||||
lambda = -1.f;
|
||||
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
|
||||
for (size_t i = 0; i < enh_->bank_size_; i++) {
|
||||
EXPECT_NEAR(kTestZeroVar[i], sols[i], kMaxTestError);
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
@ -21,45 +22,38 @@ namespace intelligibility {
|
||||
|
||||
namespace {
|
||||
|
||||
// Return |current| changed towards |target|, with the change being at most
|
||||
// |limit|.
|
||||
// Return |current| changed towards |target|, with the relative change being at
|
||||
// most |limit|.
|
||||
float UpdateFactor(float target, float current, float limit) {
|
||||
float delta = fabsf(target - current);
|
||||
float sign = copysign(1.f, target - current);
|
||||
return current + sign * fminf(delta, limit);
|
||||
float gain = target / (current + std::numeric_limits<float>::epsilon());
|
||||
if (gain < 1.f - limit) {
|
||||
gain = 1.f - limit;
|
||||
} else if (gain > 1.f + limit) {
|
||||
gain = 1.f + limit;
|
||||
}
|
||||
return current * gain + std::numeric_limits<float>::epsilon();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
PowerEstimator::PowerEstimator(size_t num_freqs,
|
||||
float decay)
|
||||
: magnitude_(new float[num_freqs]()),
|
||||
power_(new float[num_freqs]()),
|
||||
num_freqs_(num_freqs),
|
||||
decay_(decay) {
|
||||
memset(magnitude_.get(), 0, sizeof(*magnitude_.get()) * num_freqs_);
|
||||
memset(power_.get(), 0, sizeof(*power_.get()) * num_freqs_);
|
||||
}
|
||||
template<typename T>
|
||||
PowerEstimator<T>::PowerEstimator(size_t num_freqs, float decay)
|
||||
: power_(num_freqs, 0.f), decay_(decay) {}
|
||||
|
||||
// Compute the magnitude from the beginning, with exponential decaying of the
|
||||
// series data.
|
||||
void PowerEstimator::Step(const std::complex<float>* data) {
|
||||
for (size_t i = 0; i < num_freqs_; ++i) {
|
||||
magnitude_[i] = decay_ * magnitude_[i] +
|
||||
(1.f - decay_) * std::abs(data[i]);
|
||||
template<typename T>
|
||||
void PowerEstimator<T>::Step(const T* data) {
|
||||
for (size_t i = 0; i < power_.size(); ++i) {
|
||||
power_[i] = decay_ * power_[i] +
|
||||
(1.f - decay_) * std::abs(data[i]) * std::abs(data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
const float* PowerEstimator::Power() {
|
||||
for (size_t i = 0; i < num_freqs_; ++i) {
|
||||
power_[i] = magnitude_[i] * magnitude_[i];
|
||||
}
|
||||
return &power_[0];
|
||||
}
|
||||
template class PowerEstimator<float>;
|
||||
template class PowerEstimator<std::complex<float>>;
|
||||
|
||||
GainApplier::GainApplier(size_t freqs, float change_limit)
|
||||
GainApplier::GainApplier(size_t freqs, float relative_change_limit)
|
||||
: num_freqs_(freqs),
|
||||
change_limit_(change_limit),
|
||||
relative_change_limit_(relative_change_limit),
|
||||
target_(new float[freqs]()),
|
||||
current_(new float[freqs]()) {
|
||||
for (size_t i = 0; i < freqs; ++i) {
|
||||
@ -71,12 +65,8 @@ GainApplier::GainApplier(size_t freqs, float change_limit)
|
||||
void GainApplier::Apply(const std::complex<float>* in_block,
|
||||
std::complex<float>* out_block) {
|
||||
for (size_t i = 0; i < num_freqs_; ++i) {
|
||||
float factor = sqrtf(fabsf(current_[i]));
|
||||
if (!std::isnormal(factor)) {
|
||||
factor = 1.f;
|
||||
}
|
||||
out_block[i] = factor * in_block[i];
|
||||
current_[i] = UpdateFactor(target_[i], current_[i], change_limit_);
|
||||
current_[i] = UpdateFactor(target_[i], current_[i], relative_change_limit_);
|
||||
out_block[i] = sqrtf(fabsf(current_[i])) * in_block[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
|
||||
#include <complex>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
@ -21,6 +22,7 @@ namespace intelligibility {
|
||||
// Internal helper for computing the power of a stream of arrays.
|
||||
// The result is an array of power per position: the i-th power is the power of
|
||||
// the stream of data on the i-th positions in the input arrays.
|
||||
template <typename T>
|
||||
class PowerEstimator {
|
||||
public:
|
||||
// Construct an instance for the given input array length (|freqs|), with the
|
||||
@ -28,31 +30,24 @@ class PowerEstimator {
|
||||
PowerEstimator(size_t freqs, float decay);
|
||||
|
||||
// Add a new data point to the series.
|
||||
void Step(const std::complex<float>* data);
|
||||
void Step(const T* data);
|
||||
|
||||
// The current power array.
|
||||
const float* Power();
|
||||
const std::vector<float>& power() { return power_; };
|
||||
|
||||
private:
|
||||
// TODO(ekmeyerson): Switch the following running means
|
||||
// and histories from std::unique_ptr to std::vector.
|
||||
std::unique_ptr<std::complex<float>[]> running_mean_sq_;
|
||||
|
||||
// The current magnitude array.
|
||||
std::unique_ptr<float[]> magnitude_;
|
||||
// The current power array.
|
||||
std::unique_ptr<float[]> power_;
|
||||
std::vector<float> power_;
|
||||
|
||||
const size_t num_freqs_;
|
||||
const float decay_;
|
||||
};
|
||||
|
||||
// Helper class for smoothing gain changes. On each application step, the
|
||||
// currently used gains are changed towards a set of settable target gains,
|
||||
// constrained by a limit on the magnitude of the changes.
|
||||
// constrained by a limit on the relative changes.
|
||||
class GainApplier {
|
||||
public:
|
||||
GainApplier(size_t freqs, float change_limit);
|
||||
GainApplier(size_t freqs, float relative_change_limit);
|
||||
|
||||
// Copy |in_block| to |out_block|, multiplied by the current set of gains,
|
||||
// and step the current set of gains towards the target set.
|
||||
@ -64,7 +59,7 @@ class GainApplier {
|
||||
|
||||
private:
|
||||
const size_t num_freqs_;
|
||||
const float change_limit_;
|
||||
const float relative_change_limit_;
|
||||
std::unique_ptr<float[]> target_;
|
||||
std::unique_ptr<float[]> current_;
|
||||
};
|
||||
|
||||
@ -39,17 +39,16 @@ TEST(IntelligibilityUtilsTest, TestPowerEstimator) {
|
||||
const float kDecay = 0.5f;
|
||||
const std::vector<std::vector<std::complex<float>>> test_data(
|
||||
GenerateTestData(kFreqs, kSamples));
|
||||
PowerEstimator power_estimator(kFreqs, kDecay);
|
||||
EXPECT_EQ(0, power_estimator.Power()[0]);
|
||||
PowerEstimator<std::complex<float>> power_estimator(kFreqs, kDecay);
|
||||
EXPECT_EQ(0, power_estimator.power()[0]);
|
||||
|
||||
// Makes sure Step is doing something.
|
||||
power_estimator.Step(&test_data[0][0]);
|
||||
for (size_t i = 1; i < kSamples; ++i) {
|
||||
power_estimator.Step(&test_data[i][0]);
|
||||
for (size_t j = 0; j < kFreqs; ++j) {
|
||||
const float* power = power_estimator.Power();
|
||||
EXPECT_GE(power[j], 0.f);
|
||||
EXPECT_LE(power[j], 1.f);
|
||||
EXPECT_GE(power_estimator.power()[j], 0.f);
|
||||
EXPECT_LE(power_estimator.power()[j], 1.f);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -62,8 +61,8 @@ TEST(IntelligibilityUtilsTest, TestGainApplier) {
|
||||
GainApplier gain_applier(kFreqs, kChangeLimit);
|
||||
const std::vector<std::vector<std::complex<float>>> in_data(
|
||||
GenerateTestData(kFreqs, kSamples));
|
||||
std::vector<std::vector<std::complex<float>>> out_data(GenerateTestData(
|
||||
kFreqs, kSamples));
|
||||
std::vector<std::vector<std::complex<float>>> out_data(
|
||||
GenerateTestData(kFreqs, kSamples));
|
||||
for (size_t i = 0; i < kSamples; ++i) {
|
||||
gain_applier.Apply(&in_data[i][0], &out_data[i][0]);
|
||||
for (size_t j = 0; j < kFreqs; ++j) {
|
||||
|
||||
@ -30,44 +30,24 @@ using std::complex;
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
|
||||
DEFINE_double(clear_alpha, 0.9, "Power decay factor for clear data.");
|
||||
DEFINE_int32(sample_rate,
|
||||
16000,
|
||||
"Audio sample rate used in the input and output files.");
|
||||
DEFINE_int32(ana_rate,
|
||||
60,
|
||||
"Analysis rate; gains recalculated every N blocks.");
|
||||
DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block.");
|
||||
|
||||
DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");
|
||||
DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");
|
||||
DEFINE_string(out_file, "proc_enhanced.wav", "Enhanced output file.");
|
||||
|
||||
const size_t kNumChannels = 1;
|
||||
|
||||
// void function for gtest
|
||||
void void_main(int argc, char* argv[]) {
|
||||
google::SetUsageMessage(
|
||||
"\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
size_t samples; // Number of samples in input PCM file
|
||||
size_t fragment_size; // Number of samples to process at a time
|
||||
// to simulate APM stream processing
|
||||
|
||||
// Load settings and wav input.
|
||||
|
||||
fragment_size = FLAGS_sample_rate / 100; // Mirror real time APM chunk size.
|
||||
// Duplicates chunk_length_ in
|
||||
// IntelligibilityEnhancer.
|
||||
|
||||
struct stat in_stat, noise_stat;
|
||||
ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)
|
||||
<< "Empty speech file.";
|
||||
ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)
|
||||
<< "Empty noise file.";
|
||||
|
||||
samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
|
||||
const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
|
||||
|
||||
WavReader in_file(FLAGS_clear_file);
|
||||
std::vector<float> in_fpcm(samples);
|
||||
@ -80,23 +60,19 @@ void void_main(int argc, char* argv[]) {
|
||||
FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);
|
||||
|
||||
// Run intelligibility enhancement.
|
||||
IntelligibilityEnhancer::Config config;
|
||||
config.sample_rate_hz = FLAGS_sample_rate;
|
||||
config.decay_rate = static_cast<float>(FLAGS_clear_alpha);
|
||||
config.analysis_rate = FLAGS_ana_rate;
|
||||
config.gain_change_limit = FLAGS_gain_limit;
|
||||
IntelligibilityEnhancer enh(config);
|
||||
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
|
||||
rtc::CriticalSection crit;
|
||||
NoiseSuppressionImpl ns(&crit);
|
||||
ns.Initialize(kNumChannels, FLAGS_sample_rate);
|
||||
ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
|
||||
ns.Enable(true);
|
||||
|
||||
AudioBuffer capture_audio(fragment_size,
|
||||
kNumChannels,
|
||||
fragment_size,
|
||||
kNumChannels,
|
||||
// Mirror real time APM chunk size. Duplicates chunk_length_ in
|
||||
// IntelligibilityEnhancer.
|
||||
size_t fragment_size = in_file.sample_rate() / 100;
|
||||
AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),
|
||||
fragment_size, noise_file.num_channels(),
|
||||
fragment_size);
|
||||
StreamConfig stream_config(FLAGS_sample_rate, kNumChannels);
|
||||
StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());
|
||||
|
||||
// Slice the input into smaller chunks, as the APM would do, and feed them
|
||||
// through the enhancer.
|
||||
@ -108,14 +84,17 @@ void void_main(int argc, char* argv[]) {
|
||||
ns.AnalyzeCaptureAudio(&capture_audio);
|
||||
ns.ProcessCaptureAudio(&capture_audio);
|
||||
enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());
|
||||
enh.ProcessRenderAudio(&clear_cursor, FLAGS_sample_rate, kNumChannels);
|
||||
enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),
|
||||
in_file.num_channels());
|
||||
clear_cursor += fragment_size;
|
||||
noise_cursor += fragment_size;
|
||||
}
|
||||
|
||||
FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);
|
||||
|
||||
WavWriter out_file(FLAGS_out_file, FLAGS_sample_rate, kNumChannels);
|
||||
WavWriter out_file(FLAGS_out_file,
|
||||
in_file.sample_rate(),
|
||||
in_file.num_channels());
|
||||
out_file.WriteSamples(&in_fpcm[0], samples);
|
||||
}
|
||||
|
||||
|
||||
@ -182,8 +182,8 @@ std::vector<float> NoiseSuppressionImpl::NoiseEstimate() {
|
||||
for (auto& suppressor : suppressors_) {
|
||||
const float* noise = WebRtcNs_noise_estimate(suppressor->state());
|
||||
for (size_t i = 0; i < noise_estimate.size(); ++i) {
|
||||
noise_estimate[i] += kNormalizationFactor *
|
||||
noise[i] / suppressors_.size();
|
||||
noise_estimate[i] +=
|
||||
kNormalizationFactor * noise[i] / suppressors_.size();
|
||||
}
|
||||
}
|
||||
#elif defined(WEBRTC_NS_FIXED)
|
||||
|
||||
Reference in New Issue
Block a user