
This generates bit-exact output. R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/37939004 Cr-Commit-Position: refs/heads/master@{#8254} git-svn-id: http://webrtc.googlecode.com/svn/trunk@8254 4adac7df-926f-26a2-2b94-8c16560cd09d
491 lines
18 KiB
C++
491 lines
18 KiB
C++
/*
|
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#define _USE_MATH_DEFINES
|
|
|
|
#include "webrtc/modules/audio_processing/beamformer/beamformer.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
|
|
#include "webrtc/common_audio/window_generator.h"
|
|
#include "webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.h"
|
|
|
|
namespace webrtc {
|
|
namespace {
|
|
|
|
// Alpha for the Kaiser Bessel Derived window.
|
|
const float kAlpha = 1.5f;
|
|
|
|
// The minimum value a post-processing mask can take.
|
|
const float kMaskMinimum = 0.01f;
|
|
|
|
const float kSpeedOfSoundMeterSeconds = 340;
|
|
|
|
// For both target and interference angles, 0 is perpendicular to the microphone
|
|
// array, facing forwards. The positive direction goes counterclockwise.
|
|
// The angle at which we amplify sound.
|
|
const float kTargetAngleRadians = 0.f;
|
|
|
|
// The angle at which we suppress sound. Suppression is symmetric around 0
|
|
// radians, so sound is suppressed at both +|kInterfAngleRadians| and
|
|
// -|kInterfAngleRadians|. Since the beamformer is robust, this should
|
|
// suppress sound coming from angles near +-|kInterfAngleRadians| as well.
|
|
const float kInterfAngleRadians = static_cast<float>(M_PI) / 4.f;
|
|
|
|
// When calculating the interference covariance matrix, this is the weight for
|
|
// the weighted average between the uniform covariance matrix and the angled
|
|
// covariance matrix.
|
|
// Rpsi = Rpsi_angled * kBalance + Rpsi_uniform * (1 - kBalance)
|
|
const float kBalance = 0.2f;
|
|
|
|
// TODO(claguna): need comment here.
|
|
const float kBeamwidthConstant = 0.00001f;
|
|
|
|
// Width of the boxcar.
|
|
const float kBoxcarHalfWidth = 0.001f;
|
|
|
|
// We put a gap in the covariance matrix where we expect the target to come
|
|
// from. Warning: This must be very small, ex. < 0.01, because otherwise it can
|
|
// cause the covariance matrix not to be positive semidefinite, and we require
|
|
// that our covariance matrices are positive semidefinite.
|
|
const float kCovUniformGapHalfWidth = 0.001f;
|
|
|
|
// Lower bound on gain decay.
|
|
const float kHalfLifeSeconds = 0.05f;
|
|
|
|
// The average mask is computed from masks in this mid-frequency range.
|
|
const int kMidFrequnecyLowerBoundHz = 250;
|
|
const int kMidFrequencyUpperBoundHz = 400;
|
|
|
|
const int kHighFrequencyLowerBoundHz = 4000;
|
|
const int kHighFrequencyUpperBoundHz = 7000;
|
|
|
|
// Mask threshold over which the data is considered signal and not interference.
|
|
const float kMaskTargetThreshold = 0.3f;
|
|
// Time in seconds after which the data is considered interference if the mask
|
|
// does not pass |kMaskTargetThreshold|.
|
|
const float kHoldTargetSeconds = 0.25f;
|
|
|
|
// Does conjugate(|norm_mat|) * |mat| * transpose(|norm_mat|). No extra space is
|
|
// used; to accomplish this, we compute both multiplications in the same loop.
|
|
float Norm(const ComplexMatrix<float>& mat,
|
|
const ComplexMatrix<float>& norm_mat) {
|
|
CHECK_EQ(norm_mat.num_rows(), 1);
|
|
CHECK_EQ(norm_mat.num_columns(), mat.num_rows());
|
|
CHECK_EQ(norm_mat.num_columns(), mat.num_columns());
|
|
|
|
complex<float> first_product = complex<float>(0.f, 0.f);
|
|
complex<float> second_product = complex<float>(0.f, 0.f);
|
|
|
|
const complex<float>* const* mat_els = mat.elements();
|
|
const complex<float>* const* norm_mat_els = norm_mat.elements();
|
|
|
|
for (int i = 0; i < norm_mat.num_columns(); ++i) {
|
|
for (int j = 0; j < norm_mat.num_columns(); ++j) {
|
|
complex<float> cur_norm_element = conj(norm_mat_els[0][j]);
|
|
complex<float> cur_mat_element = mat_els[j][i];
|
|
first_product += cur_norm_element * cur_mat_element;
|
|
}
|
|
second_product += first_product * norm_mat_els[0][i];
|
|
first_product = 0.f;
|
|
}
|
|
return second_product.real();
|
|
}
|
|
|
|
// Does conjugate(|lhs|) * |rhs| for row vectors |lhs| and |rhs|.
|
|
complex<float> ConjugateDotProduct(const ComplexMatrix<float>& lhs,
|
|
const ComplexMatrix<float>& rhs) {
|
|
CHECK_EQ(lhs.num_rows(), 1);
|
|
CHECK_EQ(rhs.num_rows(), 1);
|
|
CHECK_EQ(lhs.num_columns(), rhs.num_columns());
|
|
|
|
const complex<float>* const* lhs_elements = lhs.elements();
|
|
const complex<float>* const* rhs_elements = rhs.elements();
|
|
|
|
complex<float> result = complex<float>(0.f, 0.f);
|
|
for (int i = 0; i < lhs.num_columns(); ++i) {
|
|
result += conj(lhs_elements[0][i]) * rhs_elements[0][i];
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// Works for positive numbers only.
|
|
int Round(float x) {
|
|
return std::floor(x + 0.5f);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
Beamformer::Beamformer(const std::vector<Point>& array_geometry)
|
|
: num_input_channels_(array_geometry.size()),
|
|
mic_spacing_(MicSpacingFromGeometry(array_geometry)) {
|
|
|
|
WindowGenerator::KaiserBesselDerived(kAlpha, kFftSize, window_);
|
|
|
|
for (int i = 0; i < kNumberSavedPostfilterMasks; ++i) {
|
|
postfilter_masks_[i].Resize(1, kNumFreqBins);
|
|
}
|
|
}
|
|
|
|
void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
|
|
chunk_length_ = sample_rate_hz / (1000.f / chunk_size_ms);
|
|
sample_rate_hz_ = sample_rate_hz;
|
|
decay_threshold_ =
|
|
pow(2, (kFftSize / -2.f) / (sample_rate_hz_ * kHalfLifeSeconds));
|
|
mid_frequency_lower_bin_bound_ =
|
|
Round(kMidFrequnecyLowerBoundHz * kFftSize / sample_rate_hz_);
|
|
mid_frequency_upper_bin_bound_ =
|
|
Round(kMidFrequencyUpperBoundHz * kFftSize / sample_rate_hz_);
|
|
high_frequency_lower_bin_bound_ =
|
|
Round(kHighFrequencyLowerBoundHz * kFftSize / sample_rate_hz_);
|
|
high_frequency_upper_bin_bound_ =
|
|
Round(kHighFrequencyUpperBoundHz * kFftSize / sample_rate_hz_);
|
|
current_block_ix_ = 0;
|
|
previous_block_ix_ = -1;
|
|
is_target_present_ = false;
|
|
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
|
|
interference_blocks_count_ = hold_target_blocks_;
|
|
|
|
DCHECK_LE(mid_frequency_upper_bin_bound_, kNumFreqBins);
|
|
DCHECK_LT(mid_frequency_lower_bin_bound_, mid_frequency_upper_bin_bound_);
|
|
DCHECK_LE(high_frequency_upper_bin_bound_, kNumFreqBins);
|
|
DCHECK_LT(high_frequency_lower_bin_bound_, high_frequency_upper_bin_bound_);
|
|
|
|
lapped_transform_.reset(new LappedTransform(num_input_channels_,
|
|
1,
|
|
chunk_length_,
|
|
window_,
|
|
kFftSize,
|
|
kFftSize / 2,
|
|
this));
|
|
|
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
|
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
|
|
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
|
|
mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
|
|
kBeamwidthConstant * wave_numbers_[i] *
|
|
wave_numbers_[i];
|
|
}
|
|
|
|
// Initialize all nonadaptive values before looping through the frames.
|
|
InitDelaySumMasks();
|
|
InitTargetCovMats();
|
|
InitInterfCovMats();
|
|
|
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
|
rxiws_[i] = Norm(target_cov_mats_[i], delay_sum_masks_[i]);
|
|
rpsiws_[i] = Norm(interf_cov_mats_[i], delay_sum_masks_[i]);
|
|
reflected_rpsiws_[i] =
|
|
Norm(reflected_interf_cov_mats_[i], delay_sum_masks_[i]);
|
|
}
|
|
}
|
|
|
|
void Beamformer::InitDelaySumMasks() {
|
|
float sin_target = sin(kTargetAngleRadians);
|
|
for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
|
|
delay_sum_masks_[f_ix].Resize(1, num_input_channels_);
|
|
CovarianceMatrixGenerator::PhaseAlignmentMasks(f_ix,
|
|
kFftSize,
|
|
sample_rate_hz_,
|
|
kSpeedOfSoundMeterSeconds,
|
|
mic_spacing_,
|
|
num_input_channels_,
|
|
sin_target,
|
|
&delay_sum_masks_[f_ix]);
|
|
|
|
complex_f norm_factor = sqrt(
|
|
ConjugateDotProduct(delay_sum_masks_[f_ix], delay_sum_masks_[f_ix]));
|
|
delay_sum_masks_[f_ix].Scale(1.f / norm_factor);
|
|
}
|
|
}
|
|
|
|
void Beamformer::InitTargetCovMats() {
|
|
target_cov_mats_[0].Resize(num_input_channels_, num_input_channels_);
|
|
CovarianceMatrixGenerator::DCCovarianceMatrix(
|
|
num_input_channels_, kBoxcarHalfWidth, &target_cov_mats_[0]);
|
|
|
|
complex_f normalization_factor = target_cov_mats_[0].Trace();
|
|
target_cov_mats_[0].Scale(1.f / normalization_factor);
|
|
|
|
for (int i = 1; i < kNumFreqBins; ++i) {
|
|
target_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
|
|
CovarianceMatrixGenerator::Boxcar(wave_numbers_[i],
|
|
num_input_channels_,
|
|
mic_spacing_,
|
|
kBoxcarHalfWidth,
|
|
&target_cov_mats_[i]);
|
|
|
|
complex_f normalization_factor = target_cov_mats_[i].Trace();
|
|
target_cov_mats_[i].Scale(1.f / normalization_factor);
|
|
}
|
|
}
|
|
|
|
void Beamformer::InitInterfCovMats() {
|
|
interf_cov_mats_[0].Resize(num_input_channels_, num_input_channels_);
|
|
CovarianceMatrixGenerator::DCCovarianceMatrix(
|
|
num_input_channels_, kCovUniformGapHalfWidth, &interf_cov_mats_[0]);
|
|
|
|
complex_f normalization_factor = interf_cov_mats_[0].Trace();
|
|
interf_cov_mats_[0].Scale(1.f / normalization_factor);
|
|
reflected_interf_cov_mats_[0].PointwiseConjugate(interf_cov_mats_[0]);
|
|
for (int i = 1; i < kNumFreqBins; ++i) {
|
|
interf_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
|
|
ComplexMatrixF uniform_cov_mat(num_input_channels_, num_input_channels_);
|
|
ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_);
|
|
|
|
CovarianceMatrixGenerator::GappedUniformCovarianceMatrix(
|
|
wave_numbers_[i],
|
|
num_input_channels_,
|
|
mic_spacing_,
|
|
kCovUniformGapHalfWidth,
|
|
&uniform_cov_mat);
|
|
|
|
CovarianceMatrixGenerator::AngledCovarianceMatrix(kSpeedOfSoundMeterSeconds,
|
|
kInterfAngleRadians,
|
|
i,
|
|
kFftSize,
|
|
kNumFreqBins,
|
|
sample_rate_hz_,
|
|
num_input_channels_,
|
|
mic_spacing_,
|
|
&angled_cov_mat);
|
|
// Normalize matrices before averaging them.
|
|
complex_f normalization_factor = uniform_cov_mat.Trace();
|
|
uniform_cov_mat.Scale(1.f / normalization_factor);
|
|
normalization_factor = angled_cov_mat.Trace();
|
|
angled_cov_mat.Scale(1.f / normalization_factor);
|
|
|
|
// Average matrices.
|
|
uniform_cov_mat.Scale(1 - kBalance);
|
|
angled_cov_mat.Scale(kBalance);
|
|
interf_cov_mats_[i].Add(uniform_cov_mat, angled_cov_mat);
|
|
reflected_interf_cov_mats_[i].PointwiseConjugate(interf_cov_mats_[i]);
|
|
}
|
|
}
|
|
|
|
void Beamformer::ProcessChunk(const float* const* input,
|
|
const float* const* high_pass_split_input,
|
|
int num_input_channels,
|
|
int num_frames_per_band,
|
|
float* const* output,
|
|
float* const* high_pass_split_output) {
|
|
CHECK_EQ(num_input_channels, num_input_channels_);
|
|
CHECK_EQ(num_frames_per_band, chunk_length_);
|
|
|
|
num_blocks_in_this_chunk_ = 0;
|
|
float old_high_pass_mask = high_pass_postfilter_mask_;
|
|
high_pass_postfilter_mask_ = 0.f;
|
|
high_pass_exists_ = high_pass_split_input != NULL;
|
|
lapped_transform_->ProcessChunk(input, output);
|
|
|
|
// Apply delay and sum and post-filter in the time domain. WARNING: only works
|
|
// because delay-and-sum is not frequency dependent.
|
|
if (high_pass_exists_) {
|
|
high_pass_postfilter_mask_ /= num_blocks_in_this_chunk_;
|
|
|
|
if (previous_block_ix_ == -1) {
|
|
old_high_pass_mask = high_pass_postfilter_mask_;
|
|
}
|
|
|
|
// Ramp up/down for smoothing. 1 mask per 10ms results in audible
|
|
// discontinuities.
|
|
float ramp_inc =
|
|
(high_pass_postfilter_mask_ - old_high_pass_mask) / num_frames_per_band;
|
|
for (int i = 0; i < num_frames_per_band; ++i) {
|
|
old_high_pass_mask += ramp_inc;
|
|
|
|
// Applying the delay and sum (at zero degrees, this is equivalent to
|
|
// averaging).
|
|
float sum = 0.f;
|
|
for (int j = 0; j < num_input_channels; ++j) {
|
|
sum += high_pass_split_input[j][i];
|
|
}
|
|
high_pass_split_output[0][i] =
|
|
sum / num_input_channels * old_high_pass_mask;
|
|
}
|
|
}
|
|
}
|
|
|
|
void Beamformer::ProcessAudioBlock(const complex_f* const* input,
|
|
int num_input_channels,
|
|
int num_freq_bins,
|
|
int num_output_channels,
|
|
complex_f* const* output) {
|
|
CHECK_EQ(num_freq_bins, kNumFreqBins);
|
|
CHECK_EQ(num_input_channels, num_input_channels_);
|
|
CHECK_EQ(num_output_channels, 1);
|
|
|
|
float* mask_data = postfilter_masks_[current_block_ix_].elements()[0];
|
|
|
|
// Calculating the post-filter masks. Note that we need two for each
|
|
// frequency bin to account for the positive and negative interferer
|
|
// angle.
|
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
|
eig_m_.CopyFromColumn(input, i, num_input_channels_);
|
|
float eig_m_norm_factor =
|
|
std::sqrt(ConjugateDotProduct(eig_m_, eig_m_)).real();
|
|
if (eig_m_norm_factor != 0.f) {
|
|
eig_m_.Scale(1.f / eig_m_norm_factor);
|
|
}
|
|
|
|
float rxim = Norm(target_cov_mats_[i], eig_m_);
|
|
float ratio_rxiw_rxim = 0.f;
|
|
if (rxim != 0.f) {
|
|
ratio_rxiw_rxim = rxiws_[i] / rxim;
|
|
}
|
|
|
|
complex_f rmw = abs(ConjugateDotProduct(delay_sum_masks_[i], eig_m_));
|
|
rmw *= rmw;
|
|
float rmw_r = rmw.real();
|
|
|
|
mask_data[i] = CalculatePostfilterMask(interf_cov_mats_[i],
|
|
rpsiws_[i],
|
|
ratio_rxiw_rxim,
|
|
rmw_r,
|
|
mask_thresholds_[i]);
|
|
|
|
mask_data[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
|
|
reflected_rpsiws_[i],
|
|
ratio_rxiw_rxim,
|
|
rmw_r,
|
|
mask_thresholds_[i]);
|
|
}
|
|
|
|
EstimateTargetPresence(mask_data, kNumFreqBins);
|
|
|
|
// Can't access block_index - 1 on the first block.
|
|
if (previous_block_ix_ >= 0) {
|
|
ApplyDecay();
|
|
}
|
|
|
|
ApplyLowFrequencyCorrection();
|
|
|
|
if (high_pass_exists_) {
|
|
CalculateHighFrequencyMask();
|
|
}
|
|
|
|
ApplyMasks(input, output);
|
|
|
|
previous_block_ix_ = current_block_ix_;
|
|
current_block_ix_ = (current_block_ix_ + 1) % kNumberSavedPostfilterMasks;
|
|
num_blocks_in_this_chunk_++;
|
|
}
|
|
|
|
float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
|
|
float rpsiw,
|
|
float ratio_rxiw_rxim,
|
|
float rmw_r,
|
|
float mask_threshold) {
|
|
float rpsim = Norm(interf_cov_mat, eig_m_);
|
|
|
|
// Find lambda.
|
|
float ratio = rpsiw / rpsim;
|
|
float numerator = rmw_r - ratio;
|
|
float denominator = ratio_rxiw_rxim - ratio;
|
|
|
|
float mask = 1.f;
|
|
if (denominator > mask_threshold) {
|
|
float lambda = numerator / denominator;
|
|
mask = std::max(lambda * ratio_rxiw_rxim / rmw_r, kMaskMinimum);
|
|
}
|
|
return mask;
|
|
}
|
|
|
|
void Beamformer::ApplyMasks(const complex_f* const* input,
|
|
complex_f* const* output) {
|
|
complex_f* output_channel = output[0];
|
|
const float* postfilter_mask_els =
|
|
postfilter_masks_[current_block_ix_].elements()[0];
|
|
for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
|
|
output_channel[f_ix] = complex_f(0.f, 0.f);
|
|
|
|
const complex_f* delay_sum_mask_els = delay_sum_masks_[f_ix].elements()[0];
|
|
for (int c_ix = 0; c_ix < num_input_channels_; ++c_ix) {
|
|
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
|
|
}
|
|
|
|
output_channel[f_ix] *= postfilter_mask_els[f_ix];
|
|
}
|
|
}
|
|
|
|
void Beamformer::ApplyDecay() {
|
|
float* current_mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
const float* previous_block_els =
|
|
postfilter_masks_[previous_block_ix_].elements()[0];
|
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
|
current_mask_els[i] =
|
|
std::max(current_mask_els[i], previous_block_els[i] * decay_threshold_);
|
|
}
|
|
}
|
|
|
|
void Beamformer::ApplyLowFrequencyCorrection() {
|
|
float low_frequency_mask = 0.f;
|
|
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
for (int i = mid_frequency_lower_bin_bound_;
|
|
i <= mid_frequency_upper_bin_bound_;
|
|
++i) {
|
|
low_frequency_mask += mask_els[i];
|
|
}
|
|
|
|
low_frequency_mask /=
|
|
mid_frequency_upper_bin_bound_ - mid_frequency_lower_bin_bound_ + 1;
|
|
|
|
for (int i = 0; i < mid_frequency_lower_bin_bound_; ++i) {
|
|
mask_els[i] = low_frequency_mask;
|
|
}
|
|
}
|
|
|
|
void Beamformer::CalculateHighFrequencyMask() {
|
|
float high_pass_mask = 0.f;
|
|
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
for (int i = high_frequency_lower_bin_bound_;
|
|
i <= high_frequency_upper_bin_bound_;
|
|
++i) {
|
|
high_pass_mask += mask_els[i];
|
|
}
|
|
|
|
high_pass_mask /=
|
|
high_frequency_upper_bin_bound_ - high_frequency_lower_bin_bound_ + 1;
|
|
|
|
high_pass_postfilter_mask_ += high_pass_mask;
|
|
}
|
|
|
|
// This method CHECKs for a uniform linear array.
|
|
float Beamformer::MicSpacingFromGeometry(const std::vector<Point>& geometry) {
|
|
CHECK_GE(geometry.size(), 2u);
|
|
float mic_spacing = 0.f;
|
|
for (size_t i = 0u; i < 3u; ++i) {
|
|
float difference = geometry[1].c[i] - geometry[0].c[i];
|
|
for (size_t j = 2u; j < geometry.size(); ++j) {
|
|
CHECK_LT(geometry[j].c[i] - geometry[j - 1].c[i] - difference, 1e-6);
|
|
}
|
|
mic_spacing += difference * difference;
|
|
}
|
|
return sqrt(mic_spacing);
|
|
}
|
|
|
|
void Beamformer::EstimateTargetPresence(float* mask, int length) {
|
|
memcpy(sorted_mask_, mask, kNumFreqBins * sizeof(*mask));
|
|
const int median_ix = (length + 1) / 2;
|
|
std::nth_element(sorted_mask_,
|
|
sorted_mask_ + median_ix,
|
|
sorted_mask_ + length);
|
|
if (sorted_mask_[median_ix] > kMaskTargetThreshold) {
|
|
is_target_present_ = true;
|
|
interference_blocks_count_ = 0;
|
|
} else {
|
|
is_target_present_ = interference_blocks_count_++ < hold_target_blocks_;
|
|
}
|
|
}
|
|
|
|
} // namespace webrtc
|