Use VAD to get a better speech power estimation in the IntelligibilityEnhancer

R=henrik.lundin@webrtc.org, turaj@webrtc.org

Review URL: https://codereview.webrtc.org/1693823004 .

Cr-Commit-Position: refs/heads/master@{#11713}
This commit is contained in:
Alejandro Luebs
2016-02-22 15:57:38 -08:00
parent 67b81f92f4
commit 18fcbcf48c
9 changed files with 330 additions and 306 deletions

View File

@ -1184,8 +1184,7 @@ bool AudioProcessingImpl::analysis_needed(bool is_data_processed) const {
}
bool AudioProcessingImpl::is_rev_processed() const {
return constants_.intelligibility_enabled &&
public_submodules_->intelligibility_enhancer->active();
return constants_.intelligibility_enabled;
}
bool AudioProcessingImpl::render_check_rev_conversion_needed() const {
@ -1236,12 +1235,9 @@ void AudioProcessingImpl::InitializeBeamformer() {
void AudioProcessingImpl::InitializeIntelligibility() {
if (constants_.intelligibility_enabled) {
IntelligibilityEnhancer::Config config;
config.sample_rate_hz = capture_nonlocked_.split_rate;
config.num_capture_channels = capture_.capture_audio->num_channels();
config.num_render_channels = render_.render_audio->num_channels();
public_submodules_->intelligibility_enhancer.reset(
new IntelligibilityEnhancer(config));
new IntelligibilityEnhancer(capture_nonlocked_.split_rate,
render_.render_audio->num_channels()));
}
}

View File

@ -27,11 +27,16 @@ namespace {
const size_t kErbResolution = 2;
const int kWindowSizeMs = 16;
const int kChunkSizeMs = 10; // Size provided by APM.
const float kClipFreq = 200.0f;
const float kConfigRho = 0.02f; // Default production and interpretation SNR.
const float kClipFreqKhz = 0.2f;
const float kKbdAlpha = 1.5f;
const float kLambdaBot = -1.0f; // Extreme values in bisection
const float kLambdaTop = -10e-18f; // search for lambda.
const float kVoiceProbabilityThreshold = 0.02f;
// Number of chunks after voice activity which is still considered speech.
const size_t kSpeechOffsetDelay = 80;
const float kDecayRate = 0.98f; // Power estimation decay rate.
const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.
const float kRho = 0.0004f; // Default production and interpretation SNR.
// Returns dot product of vectors |a| and |b| with size |length|.
float DotProduct(const float* a, const float* b, size_t length) {
@ -72,61 +77,46 @@ void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
}
}
IntelligibilityEnhancer::IntelligibilityEnhancer()
: IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {
}
IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
size_t num_render_channels)
: freqs_(RealFourier::ComplexLength(
RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),
window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),
chunk_length_(
static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)),
bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),
sample_rate_hz_(config.sample_rate_hz),
erb_resolution_(kErbResolution),
num_capture_channels_(config.num_capture_channels),
num_render_channels_(config.num_render_channels),
analysis_rate_(config.analysis_rate),
active_(true),
clear_power_(freqs_, config.decay_rate),
noise_power_(freqs_, 0.f),
RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
sample_rate_hz_(sample_rate_hz),
num_render_channels_(num_render_channels),
clear_power_estimator_(freqs_, kDecayRate),
noise_power_estimator_(
new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
filtered_clear_pow_(new float[bank_size_]),
filtered_noise_pow_(new float[bank_size_]),
center_freqs_(new float[bank_size_]),
render_filter_bank_(CreateErbBank(freqs_)),
rho_(new float[bank_size_]),
gains_eq_(new float[bank_size_]),
gain_applier_(freqs_, config.gain_change_limit),
gain_applier_(freqs_, kMaxRelativeGainChange),
temp_render_out_buffer_(chunk_length_, num_render_channels_),
kbd_window_(new float[window_size_]),
render_callback_(this),
block_count_(0),
analysis_step_(0) {
RTC_DCHECK_LE(config.rho, 1.0f);
audio_s16_(chunk_length_),
chunks_since_voice_(kSpeechOffsetDelay),
is_speech_(false) {
RTC_DCHECK_LE(kRho, 1.f);
memset(filtered_clear_pow_.get(),
0,
memset(filtered_clear_pow_.get(), 0,
bank_size_ * sizeof(filtered_clear_pow_[0]));
memset(filtered_noise_pow_.get(),
0,
memset(filtered_noise_pow_.get(), 0,
bank_size_ * sizeof(filtered_noise_pow_[0]));
// Assumes all rho equal.
for (size_t i = 0; i < bank_size_; ++i) {
rho_[i] = config.rho * config.rho;
}
const size_t erb_index = static_cast<size_t>(
ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
43.f));
start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
float freqs_khz = kClipFreq / 1000.0f;
size_t erb_index = static_cast<size_t>(ceilf(
11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));
start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
kbd_window_.get());
size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
std::vector<float> kbd_window(window_size);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
render_mangler_.reset(new LappedTransform(
num_render_channels_, num_render_channels_, chunk_length_,
kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));
num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
window_size, window_size / 2, &render_callback_));
}
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
@ -134,13 +124,10 @@ void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
if (capture_filter_bank_.size() != bank_size_ ||
capture_filter_bank_[0].size() != noise.size()) {
capture_filter_bank_ = CreateErbBank(noise.size());
noise_power_estimator_.reset(
new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
}
if (noise.size() != noise_power_.size()) {
noise_power_.resize(noise.size());
}
for (size_t i = 0; i < noise.size(); ++i) {
noise_power_[i] = noise[i] * noise[i];
}
noise_power_estimator_->Step(&noise[0]);
}
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
@ -148,54 +135,29 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
size_t num_channels) {
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
RTC_CHECK_EQ(num_render_channels_, num_channels);
if (active_) {
is_speech_ = IsSpeech(audio[0]);
render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
}
if (active_) {
for (size_t i = 0; i < num_render_channels_; ++i) {
memcpy(audio[i], temp_render_out_buffer_.channels()[i],
chunk_length_ * sizeof(**audio));
}
}
}
void IntelligibilityEnhancer::ProcessClearBlock(
const std::complex<float>* in_block,
std::complex<float>* out_block) {
if (block_count_ < 2) {
memset(out_block, 0, freqs_ * sizeof(*out_block));
++block_count_;
return;
if (is_speech_) {
clear_power_estimator_.Step(in_block);
}
// TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.
if (true) {
clear_power_.Step(in_block);
if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
AnalyzeClearBlock();
++analysis_step_;
}
++block_count_;
}
if (active_) {
gain_applier_.Apply(in_block, out_block);
}
}
void IntelligibilityEnhancer::AnalyzeClearBlock() {
const float* clear_power = clear_power_.Power();
MapToErbBands(clear_power,
render_filter_bank_,
const std::vector<float>& clear_power = clear_power_estimator_.power();
const std::vector<float>& noise_power = noise_power_estimator_->power();
MapToErbBands(&clear_power[0], render_filter_bank_,
filtered_clear_pow_.get());
MapToErbBands(&noise_power_[0],
capture_filter_bank_,
MapToErbBands(&noise_power[0], capture_filter_bank_,
filtered_noise_pow_.get());
SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
const float power_target = std::accumulate(
clear_power, clear_power + freqs_, 0.f);
const float power_target =
std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
const float power_top =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
@ -205,6 +167,7 @@ void IntelligibilityEnhancer::AnalyzeClearBlock() {
SolveForLambda(power_target, power_bot, power_top);
UpdateErbGains();
} // Else experiencing power underflow, so do nothing.
gain_applier_.Apply(in_block, out_block);
}
void IntelligibilityEnhancer::SolveForLambda(float power_target,
@ -217,11 +180,10 @@ void IntelligibilityEnhancer::SolveForLambda(float power_target,
1.f / (power_target + std::numeric_limits<float>::epsilon());
float lambda_bot = kLambdaBot;
float lambda_top = kLambdaTop;
float power_ratio = 2.0f; // Ratio of achieved power to target power.
float power_ratio = 2.f; // Ratio of achieved power to target power.
int iters = 0;
while (std::fabs(power_ratio - 1.0f) > kConvergeThresh &&
iters <= kMaxIters) {
const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;
while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;
SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
const float power =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
@ -239,7 +201,7 @@ void IntelligibilityEnhancer::UpdateErbGains() {
// (ERB gain) = filterbank' * (freq gain)
float* gains = gain_applier_.target();
for (size_t i = 0; i < freqs_; ++i) {
gains[i] = 0.0f;
gains[i] = 0.f;
for (size_t j = 0; j < bank_size_; ++j) {
gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);
}
@ -248,9 +210,9 @@ void IntelligibilityEnhancer::UpdateErbGains() {
size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,
size_t erb_resolution) {
float freq_limit = sample_rate / 2000.0f;
float freq_limit = sample_rate / 2000.f;
size_t erb_scale = static_cast<size_t>(ceilf(
11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));
11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));
return erb_scale * erb_resolution;
}
@ -260,7 +222,7 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
size_t lf = 1, rf = 4;
for (size_t i = 0; i < bank_size_; ++i) {
float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));
float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));
center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
center_freqs_[i] -= 14678.49f;
}
@ -274,48 +236,43 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
}
for (size_t i = 1; i <= bank_size_; ++i) {
size_t lll, ll, rr, rrr;
static const size_t kOne = 1; // Avoids repeated static_cast<>s below.
lll = static_cast<size_t>(round(
center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs /
(0.5f * sample_rate_hz_)));
ll = static_cast<size_t>(round(
center_freqs_[std::max(kOne, i) - 1] * num_freqs /
(0.5f * sample_rate_hz_)));
size_t lll =
static_cast<size_t>(round(center_freqs_[std::max(kOne, i - lf) - 1] *
num_freqs / (0.5f * sample_rate_hz_)));
size_t ll = static_cast<size_t>(round(center_freqs_[std::max(kOne, i) - 1] *
num_freqs / (0.5f * sample_rate_hz_)));
lll = std::min(num_freqs, std::max(lll, kOne)) - 1;
ll = std::min(num_freqs, std::max(ll, kOne)) - 1;
rrr = static_cast<size_t>(round(
center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
size_t rrr = static_cast<size_t>(
round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
(0.5f * sample_rate_hz_)));
rr = static_cast<size_t>(round(
center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
size_t rr = static_cast<size_t>(
round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
(0.5f * sample_rate_hz_)));
rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1;
rr = std::min(num_freqs, std::max(rr, kOne)) - 1;
float step, element;
step = ll == lll ? 0.f : 1.f / (ll - lll);
element = 0.0f;
float step = ll == lll ? 0.f : 1.f / (ll - lll);
float element = 0.f;
for (size_t j = lll; j <= ll; ++j) {
filter_bank[i - 1][j] = element;
element += step;
}
step = rr == rrr ? 0.f : 1.f / (rrr - rr);
element = 1.0f;
element = 1.f;
for (size_t j = rr; j <= rrr; ++j) {
filter_bank[i - 1][j] = element;
element -= step;
}
for (size_t j = ll; j <= rr; ++j) {
filter_bank[i - 1][j] = 1.0f;
filter_bank[i - 1][j] = 1.f;
}
}
float sum;
for (size_t i = 0; i < num_freqs; ++i) {
sum = 0.0f;
float sum = 0.f;
for (size_t j = 0; j < bank_size_; ++j) {
sum += filter_bank[j][i];
}
@ -329,22 +286,22 @@ std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank(
void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
size_t start_freq,
float* sols) {
bool quadratic = (kConfigRho < 1.0f);
bool quadratic = (kRho < 1.f);
const float* pow_x0 = filtered_clear_pow_.get();
const float* pow_n0 = filtered_noise_pow_.get();
for (size_t n = 0; n < start_freq; ++n) {
sols[n] = 1.0f;
sols[n] = 1.f;
}
// Analytic solution for optimal gains. See paper for derivation.
for (size_t n = start_freq - 1; n < bank_size_; ++n) {
float alpha0, beta0, gamma0;
gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] +
gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n];
beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];
if (quadratic) {
alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n];
alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];
sols[n] =
(-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /
(2 * alpha0 + std::numeric_limits<float>::epsilon());
@ -355,8 +312,15 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
}
}
bool IntelligibilityEnhancer::active() const {
return active_;
// Runs the voice activity detector on one chunk of |audio| and reports whether
// the stream is currently treated as speech. Reads |chunk_length_| samples
// from |audio| and updates |chunks_since_voice_|.
bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
// The VAD is fed 16-bit samples, so the float chunk is first converted into
// the |audio_s16_| scratch buffer.
FloatToS16(audio, chunk_length_, &audio_s16_[0]);
vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
// Voice detected in this chunk: restart the count of chunks without voice.
chunks_since_voice_ = 0;
} else if (chunks_since_voice_ < kSpeechOffsetDelay) {
// No voice: count this chunk, saturating at |kSpeechOffsetDelay|.
++chunks_since_voice_;
}
// Still considered speech until |kSpeechOffsetDelay| chunks have passed
// without the voice probability exceeding the threshold.
return chunks_since_voice_ < kSpeechOffsetDelay;
}
} // namespace webrtc

View File

@ -18,6 +18,7 @@
#include "webrtc/common_audio/lapped_transform.h"
#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
namespace webrtc {
@ -28,28 +29,7 @@ namespace webrtc {
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
class IntelligibilityEnhancer {
public:
struct Config {
// TODO(bercic): the |decay_rate|, |analysis_rate| and |gain_limit|
// parameters should probably go away once fine tuning is done.
Config()
: sample_rate_hz(16000),
num_capture_channels(1),
num_render_channels(1),
decay_rate(0.9f),
analysis_rate(60),
gain_change_limit(0.1f),
rho(0.02f) {}
int sample_rate_hz;
size_t num_capture_channels;
size_t num_render_channels;
float decay_rate;
int analysis_rate;
float gain_change_limit;
float rho;
};
explicit IntelligibilityEnhancer(const Config& config);
IntelligibilityEnhancer(); // Initialize with default config.
IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
// Sets the capture noise magnitude spectrum estimate.
void SetCaptureNoiseEstimate(std::vector<float> noise);
@ -86,9 +66,6 @@ class IntelligibilityEnhancer {
void ProcessClearBlock(const std::complex<float>* in_block,
std::complex<float>* out_block);
// Computes and sets modified gains.
void AnalyzeClearBlock();
// Bisection search for optimal |lambda|.
void SolveForLambda(float power_target, float power_bot, float power_top);
@ -105,29 +82,25 @@ class IntelligibilityEnhancer {
// Negative gains are set to 0. Stores the results in |sols|.
void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);
// Returns true if the audio is speech.
bool IsSpeech(const float* audio);
const size_t freqs_; // Num frequencies in frequency domain.
const size_t window_size_; // Window size in samples; also the block size.
const size_t chunk_length_; // Chunk size in samples.
const size_t bank_size_; // Num ERB filters.
const int sample_rate_hz_;
const int erb_resolution_;
const size_t num_capture_channels_;
const size_t num_render_channels_;
const int analysis_rate_; // Num blocks before gains recalculated.
const bool active_; // Whether render gains are being updated.
// TODO(ekm): Add logic for updating |active_|.
intelligibility::PowerEstimator clear_power_;
std::vector<float> noise_power_;
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
std::unique_ptr<intelligibility::PowerEstimator<float>>
noise_power_estimator_;
std::unique_ptr<float[]> filtered_clear_pow_;
std::unique_ptr<float[]> filtered_noise_pow_;
std::unique_ptr<float[]> center_freqs_;
std::vector<std::vector<float>> capture_filter_bank_;
std::vector<std::vector<float>> render_filter_bank_;
size_t start_freq_;
std::unique_ptr<float[]> rho_; // Production and interpretation SNR.
// for each ERB band.
std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
@ -135,11 +108,13 @@ class IntelligibilityEnhancer {
// the original input array with modifications.
ChannelBuffer<float> temp_render_out_buffer_;
std::unique_ptr<float[]> kbd_window_;
TransformCallback render_callback_;
std::unique_ptr<LappedTransform> render_mangler_;
int block_count_;
int analysis_step_;
VoiceActivityDetector vad_;
std::vector<int16_t> audio_s16_;
size_t chunks_since_voice_;
bool is_speech_;
};
} // namespace webrtc

View File

@ -26,54 +26,184 @@ namespace {
// Target output for ERB create test. Generated with matlab.
const float kTestCenterFreqs[] = {
13.169f, 26.965f, 41.423f, 56.577f, 72.461f, 89.113f, 106.57f, 124.88f,
144.08f, 164.21f, 185.34f, 207.5f, 230.75f, 255.16f, 280.77f, 307.66f,
335.9f, 365.56f, 396.71f, 429.44f, 463.84f, 500.f};
const float kTestFilterBank[][9] = {
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.5f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.5f}};
14.5213f, 29.735f, 45.6781f, 62.3884f, 79.9058f, 98.2691f, 117.521f,
137.708f, 158.879f, 181.084f, 204.378f, 228.816f, 254.459f, 281.371f,
309.618f, 339.273f, 370.411f, 403.115f, 437.469f, 473.564f, 511.497f,
551.371f, 593.293f, 637.386f, 683.77f, 732.581f, 783.96f, 838.06f,
895.046f, 955.09f, 1018.38f, 1085.13f, 1155.54f, 1229.85f, 1308.32f,
1391.22f, 1478.83f, 1571.5f, 1669.55f, 1773.37f, 1883.37f, 2000.f};
const float kTestFilterBank[][33] = {
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.157895f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.210526f, 0.117647f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.315789f, 0.176471f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.315789f, 0.352941f, 0.142857f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.352941f, 0.285714f,
0.157895f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f,
0.210526f, 0.111111f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.285714f, 0.315789f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.315789f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f,
0.108108f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f,
0.243243f, 0.153846f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f,
0.324324f, 0.230769f, 0.166667f, 0.0909091f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.324324f, 0.307692f, 0.25f, 0.181818f, 0.0833333f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.307692f, 0.333333f,
0.363636f, 0.25f, 0.151515f, 0.0793651f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.166667f, 0.363636f, 0.333333f, 0.242424f,
0.190476f, 0.133333f, 0.0689655f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.333333f, 0.30303f, 0.253968f, 0.2f, 0.137931f,
0.0714286f, 0.f, 0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.30303f, 0.31746f, 0.333333f, 0.275862f, 0.214286f,
0.125f, 0.0655738f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.15873f, 0.333333f, 0.344828f, 0.357143f,
0.25f, 0.196721f, 0.137931f, 0.0816327f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.172414f, 0.357143f,
0.3125f, 0.245902f, 0.172414f, 0.102041f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.3125f, 0.327869f, 0.344828f, 0.204082f, 0.f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.163934f, 0.344828f, 0.408163f, 0.5f},
{0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.204082f, 0.5f}};
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestFilterBank),
"Test filterbank badly initialized.");
// Target output for gain solving test. Generated with matlab.
const size_t kTestStartFreq = 12; // Lowest integral frequency for ERBs.
const float kTestZeroVar[] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
const float kTestZeroVar[] = {
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestZeroVar),
"Power test data badly initialized.");
const float kTestNonZeroVarLambdaTop[] = {
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
1.f, 1.f, 1.f, 0.f, 0.f, 0.0351f, 0.0636f, 0.0863f,
0.1037f, 0.1162f, 0.1236f, 0.1251f, 0.1189f, 0.0993f};
1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
static_assert(arraysize(kTestCenterFreqs) ==
arraysize(kTestNonZeroVarLambdaTop),
"Power test data badly initialized.");
const float kMaxTestError = 0.005f;
// Enhancer initialization parameters.
const int kSamples = 2000;
const int kSampleRate = 1000;
const int kSamples = 1000;
const int kSampleRate = 4000;
const int kNumChannels = 1;
const int kFragmentSize = kSampleRate / 100;
@ -83,13 +213,11 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
protected:
IntelligibilityEnhancerTest()
: clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {
config_.sample_rate_hz = kSampleRate;
enh_.reset(new IntelligibilityEnhancer(config_));
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
}
bool CheckUpdate() {
config_.sample_rate_hz = kSampleRate;
enh_.reset(new IntelligibilityEnhancer(config_));
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
float* clear_cursor = &clear_data_[0];
float* noise_cursor = &noise_data_[0];
for (int i = 0; i < kSamples; i += kFragmentSize) {
@ -105,7 +233,6 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
return false;
}
IntelligibilityEnhancer::Config config_;
std::unique_ptr<IntelligibilityEnhancer> enh_;
std::vector<float> clear_data_;
std::vector<float> noise_data_;
@ -115,9 +242,9 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
// For each class of generated data, tests that render stream is updated when
// it should be.
TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {
std::fill(noise_data_.begin(), noise_data_.end(), 0.0f);
std::fill(orig_data_.begin(), orig_data_.end(), 0.0f);
std::fill(clear_data_.begin(), clear_data_.end(), 0.0f);
std::fill(noise_data_.begin(), noise_data_.end(), 0.f);
std::fill(orig_data_.begin(), orig_data_.end(), 0.f);
std::fill(clear_data_.begin(), clear_data_.end(), 0.f);
EXPECT_FALSE(CheckUpdate());
std::srand(1);
auto float_rand = []() { return std::rand() * 2.f / RAND_MAX - 1; };
@ -148,9 +275,8 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
std::vector<float> sols(enh_->bank_size_);
float lambda = -0.001f;
for (size_t i = 0; i < enh_->bank_size_; i++) {
enh_->filtered_clear_pow_[i] = 0.0f;
enh_->filtered_noise_pow_[i] = 0.0f;
enh_->rho_[i] = 0.02f;
enh_->filtered_clear_pow_[i] = 0.f;
enh_->filtered_noise_pow_[i] = 0.f;
}
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
for (size_t i = 0; i < enh_->bank_size_; i++) {
@ -164,7 +290,7 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}
lambda = -1.0;
lambda = -1.f;
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestZeroVar[i], sols[i], kMaxTestError);

View File

@ -14,6 +14,7 @@
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <limits>
namespace webrtc {
@ -21,45 +22,38 @@ namespace intelligibility {
namespace {
// Returns |current| moved towards |target|, with the relative change per call
// being at most |limit|.
//
// The step is expressed as a multiplicative gain (target / current) that is
// clamped to [1 - limit, 1 + limit]. Epsilon is added to the denominator so
// that a zero |current| does not divide by zero, and to the result so that a
// zero |current| still yields a non-zero return value.
float UpdateFactor(float target, float current, float limit) {
  const float kEpsilon = std::numeric_limits<float>::epsilon();
  const float min_gain = 1.f - limit;
  const float max_gain = 1.f + limit;
  float gain = target / (current + kEpsilon);
  if (gain < min_gain) {
    gain = min_gain;
  } else if (gain > max_gain) {
    gain = max_gain;
  }
  return current * gain + kEpsilon;
}
} // namespace
// Constructs an estimator holding |num_freqs| power values, all starting at
// zero. |decay| is stored and used by Step() as the weight given to the
// previous estimate.
template <typename T>
PowerEstimator<T>::PowerEstimator(size_t num_freqs, float decay)
    : power_(num_freqs, 0.f), decay_(decay) {}
// Compute the magnitude from the beginning, with exponential decaying of the
// series data.
void PowerEstimator::Step(const std::complex<float>* data) {
for (size_t i = 0; i < num_freqs_; ++i) {
magnitude_[i] = decay_ * magnitude_[i] +
(1.f - decay_) * std::abs(data[i]);
template<typename T>
void PowerEstimator<T>::Step(const T* data) {
for (size_t i = 0; i < power_.size(); ++i) {
power_[i] = decay_ * power_[i] +
(1.f - decay_) * std::abs(data[i]) * std::abs(data[i]);
}
}
// Refreshes the power array and returns a pointer to its first element: each
// of the num_freqs_ entries of power_ is set to the square of the matching
// magnitude_ entry.
// NOTE(review): the templated PowerEstimator<T>::Step() in this section writes
// power_ directly and never touches magnitude_, so this accessor looks like it
// belongs to the earlier, non-template revision -- confirm before relying on
// it.
const float* PowerEstimator::Power() {
for (size_t i = 0; i < num_freqs_; ++i) {
// Power is derived on demand as magnitude squared.
power_[i] = magnitude_[i] * magnitude_[i];
}
return &power_[0];
}
template class PowerEstimator<float>;
template class PowerEstimator<std::complex<float>>;
GainApplier::GainApplier(size_t freqs, float change_limit)
GainApplier::GainApplier(size_t freqs, float relative_change_limit)
: num_freqs_(freqs),
change_limit_(change_limit),
relative_change_limit_(relative_change_limit),
target_(new float[freqs]()),
current_(new float[freqs]()) {
for (size_t i = 0; i < freqs; ++i) {
@ -71,12 +65,8 @@ GainApplier::GainApplier(size_t freqs, float change_limit)
// Writes |in_block| scaled by the per-frequency gains into |out_block|; both
// arrays must hold at least num_freqs_ values.
//
// For every frequency the current gain is first stepped once towards its
// target, with the relative change bounded by relative_change_limit_, and the
// input is then multiplied by the square root of the gain's absolute value
// (presumably because gains are expressed in the power domain -- TODO
// confirm).
void GainApplier::Apply(const std::complex<float>* in_block,
                        std::complex<float>* out_block) {
  for (size_t i = 0; i < num_freqs_; ++i) {
    // Exactly one smoothing step per call, applied before the gain is used.
    current_[i] = UpdateFactor(target_[i], current_[i], relative_change_limit_);
    out_block[i] = sqrtf(fabsf(current_[i])) * in_block[i];
  }
}

View File

@ -13,6 +13,7 @@
#include <complex>
#include <memory>
#include <vector>
namespace webrtc {
@ -21,6 +22,7 @@ namespace intelligibility {
// Internal helper for computing the power of a stream of arrays.
// The result is an array of power per position: the i-th power is the power of
// the stream of data on the i-th positions in the input arrays.
template <typename T>
class PowerEstimator {
public:
// Construct an instance for the given input array length (|freqs|), with the
@ -28,31 +30,24 @@ class PowerEstimator {
PowerEstimator(size_t freqs, float decay);
// Add a new data point to the series.
void Step(const std::complex<float>* data);
void Step(const T* data);
// The current power array.
const float* Power();
const std::vector<float>& power() { return power_; };
private:
// TODO(ekmeyerson): Switch the following running means
// and histories from std::unique_ptr to std::vector.
std::unique_ptr<std::complex<float>[]> running_mean_sq_;
// The current magnitude array.
std::unique_ptr<float[]> magnitude_;
// The current power array.
std::unique_ptr<float[]> power_;
std::vector<float> power_;
const size_t num_freqs_;
const float decay_;
};
// Helper class for smoothing gain changes. On each application step, the
// currently used gains are changed towards a set of settable target gains,
// constrained by a limit on the magnitude of the changes.
// constrained by a limit on the relative changes.
class GainApplier {
public:
GainApplier(size_t freqs, float change_limit);
GainApplier(size_t freqs, float relative_change_limit);
// Copy |in_block| to |out_block|, multiplied by the current set of gains,
// and step the current set of gains towards the target set.
@ -64,7 +59,7 @@ class GainApplier {
private:
const size_t num_freqs_;
const float change_limit_;
const float relative_change_limit_;
std::unique_ptr<float[]> target_;
std::unique_ptr<float[]> current_;
};

View File

@ -39,17 +39,16 @@ TEST(IntelligibilityUtilsTest, TestPowerEstimator) {
const float kDecay = 0.5f;
const std::vector<std::vector<std::complex<float>>> test_data(
GenerateTestData(kFreqs, kSamples));
PowerEstimator power_estimator(kFreqs, kDecay);
EXPECT_EQ(0, power_estimator.Power()[0]);
PowerEstimator<std::complex<float>> power_estimator(kFreqs, kDecay);
EXPECT_EQ(0, power_estimator.power()[0]);
// Makes sure Step is doing something.
power_estimator.Step(&test_data[0][0]);
for (size_t i = 1; i < kSamples; ++i) {
power_estimator.Step(&test_data[i][0]);
for (size_t j = 0; j < kFreqs; ++j) {
const float* power = power_estimator.Power();
EXPECT_GE(power[j], 0.f);
EXPECT_LE(power[j], 1.f);
EXPECT_GE(power_estimator.power()[j], 0.f);
EXPECT_LE(power_estimator.power()[j], 1.f);
}
}
}
@ -62,8 +61,8 @@ TEST(IntelligibilityUtilsTest, TestGainApplier) {
GainApplier gain_applier(kFreqs, kChangeLimit);
const std::vector<std::vector<std::complex<float>>> in_data(
GenerateTestData(kFreqs, kSamples));
std::vector<std::vector<std::complex<float>>> out_data(GenerateTestData(
kFreqs, kSamples));
std::vector<std::vector<std::complex<float>>> out_data(
GenerateTestData(kFreqs, kSamples));
for (size_t i = 0; i < kSamples; ++i) {
gain_applier.Apply(&in_data[i][0], &out_data[i][0]);
for (size_t j = 0; j < kFreqs; ++j) {

View File

@ -30,44 +30,24 @@ using std::complex;
namespace webrtc {
namespace {
DEFINE_double(clear_alpha, 0.9, "Power decay factor for clear data.");
DEFINE_int32(sample_rate,
16000,
"Audio sample rate used in the input and output files.");
DEFINE_int32(ana_rate,
60,
"Analysis rate; gains recalculated every N blocks.");
DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block.");
DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");
DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");
DEFINE_string(out_file, "proc_enhanced.wav", "Enhanced output file.");
const size_t kNumChannels = 1;
// void function for gtest
void void_main(int argc, char* argv[]) {
google::SetUsageMessage(
"\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
google::ParseCommandLineFlags(&argc, &argv, true);
size_t samples; // Number of samples in input PCM file
size_t fragment_size; // Number of samples to process at a time
// to simulate APM stream processing
// Load settings and wav input.
fragment_size = FLAGS_sample_rate / 100; // Mirror real time APM chunk size.
// Duplicates chunk_length_ in
// IntelligibilityEnhancer.
struct stat in_stat, noise_stat;
ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)
<< "Empty speech file.";
ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)
<< "Empty noise file.";
samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
WavReader in_file(FLAGS_clear_file);
std::vector<float> in_fpcm(samples);
@ -80,23 +60,19 @@ void void_main(int argc, char* argv[]) {
FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);
// Run intelligibility enhancement.
IntelligibilityEnhancer::Config config;
config.sample_rate_hz = FLAGS_sample_rate;
config.decay_rate = static_cast<float>(FLAGS_clear_alpha);
config.analysis_rate = FLAGS_ana_rate;
config.gain_change_limit = FLAGS_gain_limit;
IntelligibilityEnhancer enh(config);
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
rtc::CriticalSection crit;
NoiseSuppressionImpl ns(&crit);
ns.Initialize(kNumChannels, FLAGS_sample_rate);
ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
ns.Enable(true);
AudioBuffer capture_audio(fragment_size,
kNumChannels,
fragment_size,
kNumChannels,
// Mirror real time APM chunk size. Duplicates chunk_length_ in
// IntelligibilityEnhancer.
size_t fragment_size = in_file.sample_rate() / 100;
AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),
fragment_size, noise_file.num_channels(),
fragment_size);
StreamConfig stream_config(FLAGS_sample_rate, kNumChannels);
StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());
// Slice the input into smaller chunks, as the APM would do, and feed them
// through the enhancer.
@ -108,14 +84,17 @@ void void_main(int argc, char* argv[]) {
ns.AnalyzeCaptureAudio(&capture_audio);
ns.ProcessCaptureAudio(&capture_audio);
enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());
enh.ProcessRenderAudio(&clear_cursor, FLAGS_sample_rate, kNumChannels);
enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),
in_file.num_channels());
clear_cursor += fragment_size;
noise_cursor += fragment_size;
}
FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);
WavWriter out_file(FLAGS_out_file, FLAGS_sample_rate, kNumChannels);
WavWriter out_file(FLAGS_out_file,
in_file.sample_rate(),
in_file.num_channels());
out_file.WriteSamples(&in_fpcm[0], samples);
}

View File

@ -182,8 +182,8 @@ std::vector<float> NoiseSuppressionImpl::NoiseEstimate() {
for (auto& suppressor : suppressors_) {
const float* noise = WebRtcNs_noise_estimate(suppressor->state());
for (size_t i = 0; i < noise_estimate.size(); ++i) {
noise_estimate[i] += kNormalizationFactor *
noise[i] / suppressors_.size();
noise_estimate[i] +=
kNormalizationFactor * noise[i] / suppressors_.size();
}
}
#elif defined(WEBRTC_NS_FIXED)