Fix the stereo support in IntelligibilityEnhancer

Review URL: https://codereview.webrtc.org/1729753003

Cr-Commit-Position: refs/heads/master@{#11795}
This commit is contained in:
aluebs
2016-02-26 17:17:38 -08:00
committed by Commit bot
parent 7ffeab525c
commit 0a00759780
7 changed files with 92 additions and 161 deletions

View File

@@ -54,29 +54,12 @@ void MapToErbBands(const float* pow,
float* result) {
for (size_t i = 0; i < filter_bank.size(); ++i) {
RTC_DCHECK_GT(filter_bank[i].size(), 0u);
result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());
result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());
}
}
} // namespace
IntelligibilityEnhancer::TransformCallback::TransformCallback(
IntelligibilityEnhancer* parent)
: parent_(parent) {
}
void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
const std::complex<float>* const* in_block,
size_t in_channels,
size_t frames,
size_t /* out_channels */,
std::complex<float>* const* out_block) {
RTC_DCHECK_EQ(parent_->freqs_, frames);
for (size_t i = 0; i < in_channels; ++i) {
parent_->ProcessClearBlock(in_block[i], out_block[i]);
}
}
IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
size_t num_render_channels)
: freqs_(RealFourier::ComplexLength(
@@ -88,24 +71,17 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
clear_power_estimator_(freqs_, kDecayRate),
noise_power_estimator_(
new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
filtered_clear_pow_(new float[bank_size_]),
filtered_noise_pow_(new float[bank_size_]),
center_freqs_(new float[bank_size_]),
filtered_clear_pow_(bank_size_, 0.f),
filtered_noise_pow_(bank_size_, 0.f),
center_freqs_(bank_size_),
render_filter_bank_(CreateErbBank(freqs_)),
gains_eq_(new float[bank_size_]),
gains_eq_(bank_size_),
gain_applier_(freqs_, kMaxRelativeGainChange),
temp_render_out_buffer_(chunk_length_, num_render_channels_),
render_callback_(this),
audio_s16_(chunk_length_),
chunks_since_voice_(kSpeechOffsetDelay),
is_speech_(false) {
RTC_DCHECK_LE(kRho, 1.f);
memset(filtered_clear_pow_.get(), 0,
bank_size_ * sizeof(filtered_clear_pow_[0]));
memset(filtered_noise_pow_.get(), 0,
bank_size_ * sizeof(filtered_noise_pow_[0]));
const size_t erb_index = static_cast<size_t>(
ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
43.f));
@@ -113,10 +89,11 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
std::vector<float> kbd_window(window_size);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,
kbd_window.data());
render_mangler_.reset(new LappedTransform(
num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
window_size, window_size / 2, &render_callback_));
num_render_channels_, num_render_channels_, chunk_length_,
kbd_window.data(), window_size, window_size / 2, this));
}
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
@@ -127,7 +104,7 @@ void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
noise_power_estimator_.reset(
new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
}
noise_power_estimator_->Step(&noise[0]);
noise_power_estimator_->Step(noise.data());
}
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
@@ -136,38 +113,40 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
RTC_CHECK_EQ(num_render_channels_, num_channels);
is_speech_ = IsSpeech(audio[0]);
render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
for (size_t i = 0; i < num_render_channels_; ++i) {
memcpy(audio[i], temp_render_out_buffer_.channels()[i],
chunk_length_ * sizeof(**audio));
}
render_mangler_->ProcessChunk(audio, audio);
}
void IntelligibilityEnhancer::ProcessClearBlock(
const std::complex<float>* in_block,
std::complex<float>* out_block) {
void IntelligibilityEnhancer::ProcessAudioBlock(
const std::complex<float>* const* in_block,
size_t in_channels,
size_t frames,
size_t /* out_channels */,
std::complex<float>* const* out_block) {
RTC_DCHECK_EQ(freqs_, frames);
if (is_speech_) {
clear_power_estimator_.Step(in_block);
clear_power_estimator_.Step(in_block[0]);
}
const std::vector<float>& clear_power = clear_power_estimator_.power();
const std::vector<float>& noise_power = noise_power_estimator_->power();
MapToErbBands(&clear_power[0], render_filter_bank_,
filtered_clear_pow_.get());
MapToErbBands(&noise_power[0], capture_filter_bank_,
filtered_noise_pow_.get());
SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
MapToErbBands(clear_power.data(), render_filter_bank_,
filtered_clear_pow_.data());
MapToErbBands(noise_power.data(), capture_filter_bank_,
filtered_noise_pow_.data());
SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
const float power_target =
std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f);
const float power_top =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
const float power_bot =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
if (power_target >= power_bot && power_target <= power_top) {
SolveForLambda(power_target);
UpdateErbGains();
} // Else experiencing power underflow, so do nothing.
gain_applier_.Apply(in_block, out_block);
for (size_t i = 0; i < in_channels; ++i) {
gain_applier_.Apply(in_block[i], out_block[i]);
}
}
void IntelligibilityEnhancer::SolveForLambda(float power_target) {
@@ -182,9 +161,9 @@ void IntelligibilityEnhancer::SolveForLambda(float power_target) {
int iters = 0;
while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
const float lambda = (lambda_bot + lambda_top) / 2.f;
SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());
const float power =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
if (power < power_target) {
lambda_bot = lambda;
} else {
@@ -286,8 +265,8 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
float* sols) {
const float kMinPower = 1e-5f;
const float* pow_x0 = filtered_clear_pow_.get();
const float* pow_n0 = filtered_noise_pow_.get();
const float* pow_x0 = filtered_clear_pow_.data();
const float* pow_n0 = filtered_noise_pow_.data();
for (size_t n = 0; n < start_freq; ++n) {
sols[n] = 1.f;
@@ -316,8 +295,8 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
}
bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
FloatToS16(audio, chunk_length_, &audio_s16_[0]);
vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
FloatToS16(audio, chunk_length_, audio_s16_.data());
vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);
if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
chunks_since_voice_ = 0;
} else if (chunks_since_voice_ < kSpeechOffsetDelay) {

View File

@@ -27,7 +27,7 @@ namespace webrtc {
// frequency bin to enhance speech against the noise background.
// Details of the model and algorithm can be found in the original paper:
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
class IntelligibilityEnhancer {
class IntelligibilityEnhancer : public LappedTransform::Callback {
public:
IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
@@ -40,32 +40,19 @@ class IntelligibilityEnhancer {
size_t num_channels);
bool active() const;
protected:
// All in frequency domain, receives input |in_block|, applies
// intelligibility enhancement, and writes result to |out_block|.
void ProcessAudioBlock(const std::complex<float>* const* in_block,
size_t in_channels,
size_t frames,
size_t out_channels,
std::complex<float>* const* out_block) override;
private:
// Provides access point to the frequency domain.
class TransformCallback : public LappedTransform::Callback {
public:
TransformCallback(IntelligibilityEnhancer* parent);
// All in frequency domain, receives input |in_block|, applies
// intelligibility enhancement, and writes result to |out_block|.
void ProcessAudioBlock(const std::complex<float>* const* in_block,
size_t in_channels,
size_t frames,
size_t out_channels,
std::complex<float>* const* out_block) override;
private:
IntelligibilityEnhancer* parent_;
};
friend class TransformCallback;
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);
// Updates power computation and analysis with |in_block_|,
// and writes modified speech to |out_block|.
void ProcessClearBlock(const std::complex<float>* in_block,
std::complex<float>* out_block);
// Bisection search for optimal |lambda|.
void SolveForLambda(float power_target);
@@ -94,21 +81,16 @@ class IntelligibilityEnhancer {
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
std::unique_ptr<intelligibility::PowerEstimator<float>>
noise_power_estimator_;
std::unique_ptr<float[]> filtered_clear_pow_;
std::unique_ptr<float[]> filtered_noise_pow_;
std::unique_ptr<float[]> center_freqs_;
std::vector<float> filtered_clear_pow_;
std::vector<float> filtered_noise_pow_;
std::vector<float> center_freqs_;
std::vector<std::vector<float>> capture_filter_bank_;
std::vector<std::vector<float>> render_filter_bank_;
size_t start_freq_;
std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.
std::vector<float> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
// Destination buffers used to reassemble blocked chunks before overwriting
// the original input array with modifications.
ChannelBuffer<float> temp_render_out_buffer_;
TransformCallback render_callback_;
std::unique_ptr<LappedTransform> render_mangler_;
VoiceActivityDetector vad_;

View File

@@ -213,8 +213,8 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
bool CheckUpdate() {
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
float* clear_cursor = &clear_data_[0];
float* noise_cursor = &noise_data_[0];
float* clear_cursor = clear_data_.data();
float* noise_cursor = noise_data_.data();
for (int i = 0; i < kSamples; i += kFragmentSize) {
enh_->ProcessRenderAudio(&clear_cursor, kSampleRate, kNumChannels);
clear_cursor += kFragmentSize;
@@ -273,7 +273,7 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
enh_->filtered_clear_pow_[i] = 0.f;
enh_->filtered_noise_pow_[i] = 0.f;
}
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestZeroVar, sols[i], kMaxTestError);
}
@@ -281,12 +281,12 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
enh_->filtered_clear_pow_[i] = static_cast<float>(i + 1);
enh_->filtered_noise_pow_[i] = static_cast<float>(enh_->bank_size_ - i);
}
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}
lambda = -1.f;
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}

View File

@@ -54,13 +54,8 @@ template class PowerEstimator<std::complex<float>>;
GainApplier::GainApplier(size_t freqs, float relative_change_limit)
: num_freqs_(freqs),
relative_change_limit_(relative_change_limit),
target_(new float[freqs]()),
current_(new float[freqs]()) {
for (size_t i = 0; i < freqs; ++i) {
target_[i] = 1.f;
current_[i] = 1.f;
}
}
target_(freqs, 1.f),
current_(freqs, 1.f) {}
void GainApplier::Apply(const std::complex<float>* in_block,
std::complex<float>* out_block) {

View File

@@ -12,7 +12,6 @@
#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_
#include <complex>
#include <memory>
#include <vector>
namespace webrtc {
@@ -55,13 +54,13 @@ class GainApplier {
std::complex<float>* out_block);
// Return the current target gain set. Modify this array to set the targets.
float* target() const { return target_.get(); }
float* target() { return target_.data(); }
private:
const size_t num_freqs_;
const float relative_change_limit_;
std::unique_ptr<float[]> target_;
std::unique_ptr<float[]> current_;
std::vector<float> target_;
std::vector<float> current_;
};
} // namespace intelligibility

View File

@@ -43,9 +43,9 @@ TEST(IntelligibilityUtilsTest, TestPowerEstimator) {
EXPECT_EQ(0, power_estimator.power()[0]);
// Makes sure Step is doing something.
power_estimator.Step(&test_data[0][0]);
power_estimator.Step(test_data[0].data());
for (size_t i = 1; i < kSamples; ++i) {
power_estimator.Step(&test_data[i][0]);
power_estimator.Step(test_data[i].data());
for (size_t j = 0; j < kFreqs; ++j) {
EXPECT_GE(power_estimator.power()[j], 0.f);
EXPECT_LE(power_estimator.power()[j], 1.f);
@@ -64,7 +64,7 @@ TEST(IntelligibilityUtilsTest, TestGainApplier) {
std::vector<std::vector<std::complex<float>>> out_data(
GenerateTestData(kFreqs, kSamples));
for (size_t i = 0; i < kSamples; ++i) {
gain_applier.Apply(&in_data[i][0], &out_data[i][0]);
gain_applier.Apply(in_data[i].data(), out_data[i].data());
for (size_t j = 0; j < kFreqs; ++j) {
EXPECT_GT(out_data[i][j].real(), 0.f);
EXPECT_LT(out_data[i][j].real(), 1.f);

View File

@@ -8,17 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
//
// Command line tool for speech intelligibility enhancement. Provides for
// running and testing intelligibility_enhancer as an independent process.
// Use --help for options.
//
#include <sys/stat.h>
#include "gflags/gflags.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "webrtc/base/criticalsection.h"
#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/common_audio/include/audio_util.h"
#include "webrtc/common_audio/wav_file.h"
#include "webrtc/modules/audio_processing/audio_buffer.h"
@@ -40,62 +33,45 @@ void void_main(int argc, char* argv[]) {
"\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
google::ParseCommandLineFlags(&argc, &argv, true);
// Load settings and wav input.
struct stat in_stat, noise_stat;
ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)
<< "Empty speech file.";
ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)
<< "Empty noise file.";
const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
WavReader in_file(FLAGS_clear_file);
std::vector<float> in_fpcm(samples);
in_file.ReadSamples(samples, &in_fpcm[0]);
FloatS16ToFloat(&in_fpcm[0], samples, &in_fpcm[0]);
WavReader noise_file(FLAGS_noise_file);
std::vector<float> noise_fpcm(samples);
noise_file.ReadSamples(samples, &noise_fpcm[0]);
FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);
// Run intelligibility enhancement.
WavWriter out_file(FLAGS_out_file, in_file.sample_rate(),
in_file.num_channels());
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
rtc::CriticalSection crit;
NoiseSuppressionImpl ns(&crit);
ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
ns.Enable(true);
// Mirror real time APM chunk size. Duplicates chunk_length_ in
// IntelligibilityEnhancer.
size_t fragment_size = in_file.sample_rate() / 100;
AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),
fragment_size, noise_file.num_channels(),
fragment_size);
StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());
// Slice the input into smaller chunks, as the APM would do, and feed them
// through the enhancer.
float* clear_cursor = &in_fpcm[0];
float* noise_cursor = &noise_fpcm[0];
for (size_t i = 0; i < samples; i += fragment_size) {
capture_audio.CopyFrom(&noise_cursor, stream_config);
const size_t in_samples = noise_file.sample_rate() / 100;
const size_t noise_samples = noise_file.sample_rate() / 100;
std::vector<float> in(in_samples * in_file.num_channels());
std::vector<float> noise(noise_samples * noise_file.num_channels());
ChannelBuffer<float> in_buf(in_samples, in_file.num_channels());
ChannelBuffer<float> noise_buf(noise_samples, noise_file.num_channels());
AudioBuffer capture_audio(noise_samples, noise_file.num_channels(),
noise_samples, noise_file.num_channels(),
noise_samples);
StreamConfig stream_config(noise_file.sample_rate(),
noise_file.num_channels());
while (in_file.ReadSamples(in.size(), in.data()) == in.size() &&
noise_file.ReadSamples(noise.size(), noise.data()) == noise.size()) {
FloatS16ToFloat(in.data(), in.size(), in.data());
FloatS16ToFloat(noise.data(), noise.size(), noise.data());
Deinterleave(in.data(), in_buf.num_frames(), in_buf.num_channels(),
in_buf.channels());
Deinterleave(noise.data(), noise_buf.num_frames(), noise_buf.num_channels(),
noise_buf.channels());
capture_audio.CopyFrom(noise_buf.channels(), stream_config);
ns.AnalyzeCaptureAudio(&capture_audio);
ns.ProcessCaptureAudio(&capture_audio);
enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());
enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),
enh.ProcessRenderAudio(in_buf.channels(), in_file.sample_rate(),
in_file.num_channels());
clear_cursor += fragment_size;
noise_cursor += fragment_size;
Interleave(in_buf.channels(), in_buf.num_frames(), in_buf.num_channels(),
in.data());
FloatToFloatS16(in.data(), in.size(), in.data());
out_file.WriteSamples(in.data(), in.size());
}
FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);
WavWriter out_file(FLAGS_out_file,
in_file.sample_rate(),
in_file.num_channels());
out_file.WriteSamples(&in_fpcm[0], samples);
}
} // namespace