RNN VAD: Opus band spectral analysis refactoring
This CL refactors the computation of band energy and spectral cross-correlation coefficients by moving and optimizing the code from ComputeBandCoefficients, ComputeBandEnergies and ComputeSpectralCrossCorrelation into a single class (named BandFeaturesExtractor). This change will also help replace the FFT library in the RNN VAD. Bug: webrtc:10480 Change-Id: I6cefa23e8f3bc8de6eb09d3ea434699d5e19124e Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/129726 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Per Åhgren <peah@webrtc.org> Cr-Commit-Position: refs/heads/master@{#27535}
This commit is contained in:
committed by
Commit Bot
parent
d93a004086
commit
4a53766c84
@ -38,7 +38,6 @@ rtc_source_set("rnn_vad") {
|
||||
deps = [
|
||||
"..:biquad_filter",
|
||||
"../../../../api:array_view",
|
||||
"../../../../api:function_view",
|
||||
"../../../../rtc_base:checks",
|
||||
"../../../../rtc_base:rtc_base_approved",
|
||||
"../../utility:pffft_wrapper",
|
||||
|
||||
@ -52,17 +52,13 @@ constexpr size_t kNumInvertedLags12kHz = kMaxPitch12kHz - kInitialMinPitch12kHz;
|
||||
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
|
||||
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
|
||||
|
||||
// Sub-band frequency boundaries.
|
||||
// Spectral features.
|
||||
constexpr size_t kFftSizeBy2Plus1 = kFrameSize20ms24kHz / 2 + 1;
|
||||
constexpr size_t kNumBands = 22;
|
||||
constexpr int kBandFrequencyBoundaries[kNumBands] = {
|
||||
0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400,
|
||||
2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
|
||||
|
||||
// Feature extraction parameters.
|
||||
constexpr size_t kNumLowerBands = 6;
|
||||
static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), "");
|
||||
constexpr size_t kSpectralCoeffsHistorySize = 8;
|
||||
static_assert(kSpectralCoeffsHistorySize > 2,
|
||||
constexpr size_t kCepstralCoeffsHistorySize = 8;
|
||||
static_assert(kCepstralCoeffsHistorySize > 2,
|
||||
"The history size must at least be 3 to compute first and second "
|
||||
"derivatives.");
|
||||
|
||||
|
||||
@ -78,12 +78,12 @@ bool FeaturesExtractor::CheckSilenceComputeFeatures(
|
||||
// and write the feature vector.
|
||||
return spectral_features_extractor_.CheckSilenceComputeFeatures(
|
||||
reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz},
|
||||
{{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
|
||||
{feature_vector.data(), kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands, kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
|
||||
&feature_vector[kFeatureVectorSize - 1]});
|
||||
{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
|
||||
{feature_vector.data(), kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands, kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
|
||||
{feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
|
||||
&feature_vector[kFeatureVectorSize - 1]);
|
||||
}
|
||||
|
||||
} // namespace rnn_vad
|
||||
|
||||
@ -35,16 +35,16 @@ std::array<float, kHalfFrameSize> ComputeHalfVorbisWindow() {
|
||||
|
||||
} // namespace
|
||||
|
||||
BandAnalysisFft::BandAnalysisFft()
|
||||
FftUtil::FftUtil()
|
||||
: half_window_(ComputeHalfVorbisWindow()),
|
||||
fft_(static_cast<int>(input_buf_.size())) {}
|
||||
|
||||
BandAnalysisFft::~BandAnalysisFft() = default;
|
||||
FftUtil::~FftUtil() = default;
|
||||
|
||||
void BandAnalysisFft::ForwardFft(rtc::ArrayView<const float> samples,
|
||||
rtc::ArrayView<std::complex<float>> dst) {
|
||||
void FftUtil::WindowedFft(rtc::ArrayView<const float> samples,
|
||||
rtc::ArrayView<std::complex<float>> dst) {
|
||||
RTC_DCHECK_EQ(samples.size(), kFrameSize20ms24kHz);
|
||||
RTC_DCHECK_EQ(dst.size(), kFrameSize20ms24kHz / 2 + 1);
|
||||
RTC_DCHECK_EQ(dst.size(), kFftSizeBy2Plus1);
|
||||
// Apply windowing.
|
||||
RTC_DCHECK_EQ(input_buf_.size(), 2 * half_window_.size());
|
||||
for (size_t i = 0; i < input_buf_.size() / 2; ++i) {
|
||||
|
||||
@ -21,32 +21,31 @@
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
|
||||
// TODO(alessiob): Switch to PFFFT using its own wrapper.
|
||||
// TODO(alessiob): Delete this class when switching to PFFFT.
|
||||
// TODO(alessiob): Switch to PFFFT and remove this class.
|
||||
// FFT implementation wrapper for the band-wise analysis step in which 20 ms
|
||||
// frames at 24 kHz are analyzed in the frequency domain. The goals of this class
|
||||
// are (i) making it easy to switch to another FFT implementation, (ii) owning the
|
||||
// input buffer for the FFT and (iii) applying a windowing function before
|
||||
// computing the FFT.
|
||||
class BandAnalysisFft {
|
||||
class FftUtil {
|
||||
public:
|
||||
BandAnalysisFft();
|
||||
BandAnalysisFft(const BandAnalysisFft&) = delete;
|
||||
BandAnalysisFft& operator=(const BandAnalysisFft&) = delete;
|
||||
~BandAnalysisFft();
|
||||
FftUtil();
|
||||
FftUtil(const FftUtil&) = delete;
|
||||
FftUtil& operator=(const FftUtil&) = delete;
|
||||
~FftUtil();
|
||||
// Applies a windowing function to |samples|, computes the real forward FFT
|
||||
// and writes the result in |dst|.
|
||||
// The size of |samples| must be 480 (20 ms at 24 kHz).
|
||||
// The size of |dst| must be 241 since the complex conjugate is not written.
|
||||
void ForwardFft(rtc::ArrayView<const float> samples,
|
||||
rtc::ArrayView<std::complex<float>> dst);
|
||||
void WindowedFft(rtc::ArrayView<const float> samples,
|
||||
rtc::ArrayView<std::complex<float>> dst);
|
||||
|
||||
private:
|
||||
static_assert((kFrameSize20ms24kHz & 1) == 0,
|
||||
"kFrameSize20ms24kHz must be even.");
|
||||
const std::array<float, kFrameSize20ms24kHz / 2> half_window_;
|
||||
std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_{};
|
||||
std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_{};
|
||||
std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_;
|
||||
std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_;
|
||||
rnnoise::KissFft fft_;
|
||||
};
|
||||
|
||||
|
||||
@ -39,16 +39,16 @@ std::vector<float> CreateSine(float amplitude,
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(RnnVadTest, BandAnalysisFftTest) {
|
||||
TEST(RnnVadTest, FftUtilTest) {
|
||||
for (float frequency_hz : {200.f, 450.f, 1500.f}) {
|
||||
SCOPED_TRACE(frequency_hz);
|
||||
auto x = CreateSine(
|
||||
/*amplitude=*/1000.f, frequency_hz,
|
||||
/*duration_s=*/0.02f,
|
||||
/*sample_rate_hz=*/kSampleRate24kHz);
|
||||
BandAnalysisFft analyzer;
|
||||
FftUtil analyzer;
|
||||
std::vector<std::complex<float>> x_fft(x.size() / 2 + 1);
|
||||
analyzer.ForwardFft(x, x_fft);
|
||||
analyzer.WindowedFft(x, x_fft);
|
||||
int peak_fft_bin_index = std::distance(
|
||||
x_fft.begin(),
|
||||
std::max_element(x_fft.begin(), x_fft.end(),
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
|
||||
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
|
||||
#include "rtc_base/checks.h"
|
||||
|
||||
namespace webrtc {
|
||||
@ -24,21 +23,21 @@ namespace {
|
||||
|
||||
constexpr float kSilenceThreshold = 0.04f;
|
||||
|
||||
// Computes the new spectral difference stats and pushes them into the passed
|
||||
// Computes the new cepstral difference stats and pushes them into the passed
|
||||
// symmetric matrix buffer.
|
||||
void UpdateSpectralDifferenceStats(
|
||||
rtc::ArrayView<const float, kNumBands> new_spectral_coeffs,
|
||||
const RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>& ring_buf,
|
||||
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize>* sym_matrix_buf) {
|
||||
void UpdateCepstralDifferenceStats(
|
||||
rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs,
|
||||
const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf,
|
||||
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) {
|
||||
RTC_DCHECK(sym_matrix_buf);
|
||||
// Compute the new spectral distance stats.
|
||||
std::array<float, kSpectralCoeffsHistorySize - 1> distances;
|
||||
for (size_t i = 0; i < kSpectralCoeffsHistorySize - 1; ++i) {
|
||||
// Compute the new cepstral distance stats.
|
||||
std::array<float, kCepstralCoeffsHistorySize - 1> distances;
|
||||
for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) {
|
||||
const size_t delay = i + 1;
|
||||
auto old_spectral_coeffs = ring_buf.GetArrayView(delay);
|
||||
auto old_cepstral_coeffs = ring_buf.GetArrayView(delay);
|
||||
distances[i] = 0.f;
|
||||
for (size_t k = 0; k < kNumBands; ++k) {
|
||||
const float c = new_spectral_coeffs[k] - old_spectral_coeffs[k];
|
||||
const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k];
|
||||
distances[i] += c * c;
|
||||
}
|
||||
}
|
||||
@ -48,96 +47,77 @@ void UpdateSpectralDifferenceStats(
|
||||
|
||||
} // namespace
|
||||
|
||||
SpectralFeaturesView::SpectralFeaturesView(
|
||||
rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> cross_correlations,
|
||||
float* variability)
|
||||
: coeffs(coeffs),
|
||||
average(average),
|
||||
first_derivative(first_derivative),
|
||||
second_derivative(second_derivative),
|
||||
cross_correlations(cross_correlations),
|
||||
variability(variability) {}
|
||||
|
||||
SpectralFeaturesView::SpectralFeaturesView(const SpectralFeaturesView&) =
|
||||
default;
|
||||
SpectralFeaturesView::~SpectralFeaturesView() = default;
|
||||
|
||||
SpectralFeaturesExtractor::SpectralFeaturesExtractor()
|
||||
: fft_(),
|
||||
reference_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
|
||||
lagged_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
|
||||
band_boundaries_(
|
||||
ComputeBandBoundaryIndexes(kSampleRate24kHz, kFrameSize20ms24kHz)),
|
||||
reference_frame_fft_(kFftSizeBy2Plus1),
|
||||
lagged_frame_fft_(kFftSizeBy2Plus1),
|
||||
dct_table_(ComputeDctTable()) {}
|
||||
|
||||
SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
|
||||
|
||||
void SpectralFeaturesExtractor::Reset() {
|
||||
spectral_coeffs_ring_buf_.Reset();
|
||||
spectral_diffs_buf_.Reset();
|
||||
cepstral_coeffs_ring_buf_.Reset();
|
||||
cepstral_diffs_buf_.Reset();
|
||||
}
|
||||
|
||||
bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
|
||||
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
|
||||
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
|
||||
SpectralFeaturesView spectral_features) {
|
||||
// Analyze reference frame.
|
||||
fft_.ForwardFft(reference_frame, reference_frame_fft_);
|
||||
ComputeBandEnergies(reference_frame_fft_, band_boundaries_,
|
||||
reference_frame_energy_coeffs_);
|
||||
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
|
||||
float* variability) {
|
||||
// Compute the Opus band energies for the reference frame.
|
||||
fft_.WindowedFft(reference_frame, reference_frame_fft_);
|
||||
spectral_correlator_.ComputeAutoCorrelation(
|
||||
{reference_frame_fft_.data(), kFftSizeBy2Plus1},
|
||||
reference_frame_bands_energy_);
|
||||
// Check if the reference frame has silence.
|
||||
const float tot_energy =
|
||||
std::accumulate(reference_frame_energy_coeffs_.begin(),
|
||||
reference_frame_energy_coeffs_.end(), 0.f);
|
||||
if (tot_energy < kSilenceThreshold)
|
||||
std::accumulate(reference_frame_bands_energy_.begin(),
|
||||
reference_frame_bands_energy_.end(), 0.f);
|
||||
if (tot_energy < kSilenceThreshold) {
|
||||
return true;
|
||||
// Analyze lagged frame.
|
||||
fft_.ForwardFft(lagged_frame, lagged_frame_fft_);
|
||||
ComputeBandEnergies(lagged_frame_fft_, band_boundaries_,
|
||||
lagged_frame_energy_coeffs_);
|
||||
}
|
||||
// Compute the Opus band energies for the lagged frame.
|
||||
fft_.WindowedFft(lagged_frame, lagged_frame_fft_);
|
||||
spectral_correlator_.ComputeAutoCorrelation(
|
||||
{lagged_frame_fft_.data(), kFftSizeBy2Plus1}, lagged_frame_bands_energy_);
|
||||
// Log of the band energies for the reference frame.
|
||||
std::array<float, kNumBands> log_band_energy_coeffs;
|
||||
ComputeLogBandEnergiesCoefficients(reference_frame_energy_coeffs_,
|
||||
log_band_energy_coeffs);
|
||||
// Decorrelate band-wise log energy coefficients via DCT.
|
||||
std::array<float, kNumBands> log_band_energy_coeffs_decorrelated;
|
||||
ComputeDct(log_band_energy_coeffs, dct_table_,
|
||||
log_band_energy_coeffs_decorrelated);
|
||||
// Normalize (based on training set stats).
|
||||
log_band_energy_coeffs_decorrelated[0] -= 12;
|
||||
log_band_energy_coeffs_decorrelated[1] -= 4;
|
||||
// Update the ring buffer and the spectral difference stats.
|
||||
spectral_coeffs_ring_buf_.Push(log_band_energy_coeffs_decorrelated);
|
||||
UpdateSpectralDifferenceStats(log_band_energy_coeffs_decorrelated,
|
||||
spectral_coeffs_ring_buf_,
|
||||
&spectral_diffs_buf_);
|
||||
// Write the higher bands spectral coefficients.
|
||||
auto coeffs_src = spectral_coeffs_ring_buf_.GetArrayView(0);
|
||||
RTC_DCHECK_EQ(coeffs_src.size() - kNumLowerBands,
|
||||
spectral_features.coeffs.size());
|
||||
std::copy(coeffs_src.begin() + kNumLowerBands, coeffs_src.end(),
|
||||
spectral_features.coeffs.begin());
|
||||
std::array<float, kNumBands> log_bands_energy;
|
||||
ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_,
|
||||
log_bands_energy);
|
||||
// Reference frame cepstrum.
|
||||
std::array<float, kNumBands> cepstrum;
|
||||
ComputeDct(log_bands_energy, dct_table_, cepstrum);
|
||||
// Ad-hoc correction terms for the first two cepstral coefficients.
|
||||
cepstrum[0] -= 12.f;
|
||||
cepstrum[1] -= 4.f;
|
||||
// Update the ring buffer and the cepstral difference stats.
|
||||
cepstral_coeffs_ring_buf_.Push(cepstrum);
|
||||
UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_,
|
||||
&cepstral_diffs_buf_);
|
||||
// Write the higher bands cepstral coefficients.
|
||||
RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size());
|
||||
std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(),
|
||||
higher_bands_cepstrum.begin());
|
||||
// Compute and write remaining features.
|
||||
ComputeAvgAndDerivatives(spectral_features.average,
|
||||
spectral_features.first_derivative,
|
||||
spectral_features.second_derivative);
|
||||
ComputeCrossCorrelation(spectral_features.cross_correlations);
|
||||
RTC_DCHECK(spectral_features.variability);
|
||||
*(spectral_features.variability) = ComputeVariability();
|
||||
ComputeAvgAndDerivatives(average, first_derivative, second_derivative);
|
||||
ComputeNormalizedCepstralCorrelation(bands_cross_corr);
|
||||
RTC_DCHECK(variability);
|
||||
*variability = ComputeVariability();
|
||||
return false;
|
||||
}
|
||||
|
||||
void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative) {
|
||||
auto curr = spectral_coeffs_ring_buf_.GetArrayView(0);
|
||||
auto prev1 = spectral_coeffs_ring_buf_.GetArrayView(1);
|
||||
auto prev2 = spectral_coeffs_ring_buf_.GetArrayView(2);
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative) const {
|
||||
auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0);
|
||||
auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1);
|
||||
auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2);
|
||||
RTC_DCHECK_EQ(average.size(), first_derivative.size());
|
||||
RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
|
||||
RTC_DCHECK_LE(average.size(), curr.size());
|
||||
@ -151,47 +131,41 @@ void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
|
||||
}
|
||||
}
|
||||
|
||||
void SpectralFeaturesExtractor::ComputeCrossCorrelation(
|
||||
rtc::ArrayView<float, kNumLowerBands> cross_correlations) {
|
||||
const auto& x = reference_frame_fft_;
|
||||
const auto& y = lagged_frame_fft_;
|
||||
auto cross_corr = [x, y](const size_t freq_bin_index) -> float {
|
||||
return (x[freq_bin_index].real() * y[freq_bin_index].real() +
|
||||
x[freq_bin_index].imag() * y[freq_bin_index].imag());
|
||||
};
|
||||
std::array<float, kNumBands> cross_corr_coeffs;
|
||||
constexpr size_t kNumFftPoints = kFrameSize20ms24kHz / 2 + 1;
|
||||
ComputeBandCoefficients(cross_corr, band_boundaries_, kNumFftPoints - 1,
|
||||
cross_corr_coeffs);
|
||||
void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation(
|
||||
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) {
|
||||
spectral_correlator_.ComputeCrossCorrelation(
|
||||
{reference_frame_fft_.data(), kFftSizeBy2Plus1},
|
||||
{lagged_frame_fft_.data(), kFftSizeBy2Plus1}, bands_cross_corr_);
|
||||
// Normalize.
|
||||
for (size_t i = 0; i < cross_corr_coeffs.size(); ++i) {
|
||||
cross_corr_coeffs[i] =
|
||||
cross_corr_coeffs[i] /
|
||||
std::sqrt(0.001f + reference_frame_energy_coeffs_[i] *
|
||||
lagged_frame_energy_coeffs_[i]);
|
||||
for (size_t i = 0; i < bands_cross_corr_.size(); ++i) {
|
||||
bands_cross_corr_[i] =
|
||||
bands_cross_corr_[i] /
|
||||
std::sqrt(0.001f + reference_frame_bands_energy_[i] *
|
||||
lagged_frame_bands_energy_[i]);
|
||||
}
|
||||
// Decorrelate.
|
||||
ComputeDct(cross_corr_coeffs, dct_table_, cross_correlations);
|
||||
// Normalize (based on training set stats).
|
||||
cross_correlations[0] -= 1.3f;
|
||||
cross_correlations[1] -= 0.9f;
|
||||
// Cepstrum.
|
||||
ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr);
|
||||
// Ad-hoc correction terms for the first two cepstral coefficients.
|
||||
bands_cross_corr[0] -= 1.3f;
|
||||
bands_cross_corr[1] -= 0.9f;
|
||||
}
|
||||
|
||||
float SpectralFeaturesExtractor::ComputeVariability() {
|
||||
// Compute spectral variability score.
|
||||
float spec_variability = 0.f;
|
||||
for (size_t delay1 = 0; delay1 < kSpectralCoeffsHistorySize; ++delay1) {
|
||||
float SpectralFeaturesExtractor::ComputeVariability() const {
|
||||
// Compute cepstral variability score.
|
||||
float variability = 0.f;
|
||||
for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) {
|
||||
float min_dist = std::numeric_limits<float>::max();
|
||||
for (size_t delay2 = 0; delay2 < kSpectralCoeffsHistorySize; ++delay2) {
|
||||
for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) {
|
||||
if (delay1 == delay2) // The distance would be 0.
|
||||
continue;
|
||||
min_dist =
|
||||
std::min(min_dist, spectral_diffs_buf_.GetValue(delay1, delay2));
|
||||
std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2));
|
||||
}
|
||||
spec_variability += min_dist;
|
||||
variability += min_dist;
|
||||
}
|
||||
// Normalize (based on training set stats).
|
||||
return spec_variability / kSpectralCoeffsHistorySize - 2.1f;
|
||||
// TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction.
|
||||
return variability / kCepstralCoeffsHistorySize - 2.1f;
|
||||
}
|
||||
|
||||
} // namespace rnn_vad
|
||||
|
||||
@ -20,34 +20,12 @@
|
||||
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/fft_util.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
|
||||
// View on spectral features.
|
||||
class SpectralFeaturesView {
|
||||
public:
|
||||
SpectralFeaturesView(rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> cross_correlations,
|
||||
float* variability);
|
||||
SpectralFeaturesView(const SpectralFeaturesView&);
|
||||
~SpectralFeaturesView();
|
||||
// Higher bands spectral coefficients.
|
||||
const rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs;
|
||||
// Average and first and second derivative over time for the lower bands.
|
||||
const rtc::ArrayView<float, kNumLowerBands> average;
|
||||
const rtc::ArrayView<float, kNumLowerBands> first_derivative;
|
||||
const rtc::ArrayView<float, kNumLowerBands> second_derivative;
|
||||
// Spectral cross-correlation for the lower bands.
|
||||
const rtc::ArrayView<float, kNumLowerBands> cross_correlations;
|
||||
// Spectral variability score.
|
||||
float* const variability;
|
||||
};
|
||||
|
||||
// Class to compute spectral features.
|
||||
class SpectralFeaturesExtractor {
|
||||
public:
|
||||
@ -64,27 +42,33 @@ class SpectralFeaturesExtractor {
|
||||
bool CheckSilenceComputeFeatures(
|
||||
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
|
||||
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
|
||||
SpectralFeaturesView spectral_features);
|
||||
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
|
||||
float* variability);
|
||||
|
||||
private:
|
||||
void ComputeAvgAndDerivatives(
|
||||
rtc::ArrayView<float, kNumLowerBands> average,
|
||||
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative);
|
||||
void ComputeCrossCorrelation(
|
||||
rtc::ArrayView<float, kNumLowerBands> cross_correlations);
|
||||
float ComputeVariability();
|
||||
rtc::ArrayView<float, kNumLowerBands> second_derivative) const;
|
||||
void ComputeNormalizedCepstralCorrelation(
|
||||
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr);
|
||||
float ComputeVariability() const;
|
||||
|
||||
BandAnalysisFft fft_;
|
||||
FftUtil fft_;
|
||||
std::vector<std::complex<float>> reference_frame_fft_;
|
||||
std::vector<std::complex<float>> lagged_frame_fft_;
|
||||
std::array<float, kNumBands> reference_frame_energy_coeffs_{};
|
||||
std::array<float, kNumBands> lagged_frame_energy_coeffs_{};
|
||||
const std::array<size_t, kNumBands> band_boundaries_;
|
||||
SpectralCorrelator spectral_correlator_;
|
||||
std::array<float, kOpusBands24kHz> reference_frame_bands_energy_;
|
||||
std::array<float, kOpusBands24kHz> lagged_frame_bands_energy_;
|
||||
std::array<float, kOpusBands24kHz> bands_cross_corr_;
|
||||
const std::array<float, kNumBands * kNumBands> dct_table_;
|
||||
RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>
|
||||
spectral_coeffs_ring_buf_;
|
||||
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize> spectral_diffs_buf_;
|
||||
RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>
|
||||
cepstral_coeffs_ring_buf_;
|
||||
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize> cepstral_diffs_buf_;
|
||||
};
|
||||
|
||||
} // namespace rnn_vad
|
||||
|
||||
@ -20,85 +20,126 @@ namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
namespace {
|
||||
|
||||
// DCT scaling factor.
|
||||
static_assert(
|
||||
kNumBands == 22,
|
||||
"kNumBands changed! Please update the value of kDctScalingFactor");
|
||||
constexpr float kDctScalingFactor = 0.301511345f; // sqrt(2 / kNumBands)
|
||||
// Weights for each FFT coefficient for each Opus band (Nyquist frequency
|
||||
// excluded). The size of each band is specified in
|
||||
// |kOpusScaleNumBins24kHz20ms|.
|
||||
constexpr std::array<float, kFrameSize20ms24kHz / 2> kOpusBandWeights24kHz20ms =
|
||||
{{
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 0
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 1
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 2
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 3
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 4
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 5
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 6
|
||||
0.f, 0.25f, 0.5f, 0.75f, // Band 7
|
||||
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
|
||||
0.625f, 0.75f, 0.875f, // Band 8
|
||||
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
|
||||
0.625f, 0.75f, 0.875f, // Band 9
|
||||
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
|
||||
0.625f, 0.75f, 0.875f, // Band 10
|
||||
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
|
||||
0.625f, 0.75f, 0.875f, // Band 11
|
||||
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
|
||||
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
|
||||
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
|
||||
0.9375f, // Band 12
|
||||
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
|
||||
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
|
||||
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
|
||||
0.9375f, // Band 13
|
||||
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
|
||||
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
|
||||
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
|
||||
0.9375f, // Band 14
|
||||
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
|
||||
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
|
||||
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
|
||||
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
|
||||
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 15
|
||||
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
|
||||
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
|
||||
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
|
||||
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
|
||||
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 16
|
||||
0.f, 0.03125f, 0.0625f, 0.09375f, 0.125f,
|
||||
0.15625f, 0.1875f, 0.21875f, 0.25f, 0.28125f,
|
||||
0.3125f, 0.34375f, 0.375f, 0.40625f, 0.4375f,
|
||||
0.46875f, 0.5f, 0.53125f, 0.5625f, 0.59375f,
|
||||
0.625f, 0.65625f, 0.6875f, 0.71875f, 0.75f,
|
||||
0.78125f, 0.8125f, 0.84375f, 0.875f, 0.90625f,
|
||||
0.9375f, 0.96875f, // Band 17
|
||||
0.f, 0.0208333f, 0.0416667f, 0.0625f, 0.0833333f,
|
||||
0.104167f, 0.125f, 0.145833f, 0.166667f, 0.1875f,
|
||||
0.208333f, 0.229167f, 0.25f, 0.270833f, 0.291667f,
|
||||
0.3125f, 0.333333f, 0.354167f, 0.375f, 0.395833f,
|
||||
0.416667f, 0.4375f, 0.458333f, 0.479167f, 0.5f,
|
||||
0.520833f, 0.541667f, 0.5625f, 0.583333f, 0.604167f,
|
||||
0.625f, 0.645833f, 0.666667f, 0.6875f, 0.708333f,
|
||||
0.729167f, 0.75f, 0.770833f, 0.791667f, 0.8125f,
|
||||
0.833333f, 0.854167f, 0.875f, 0.895833f, 0.916667f,
|
||||
0.9375f, 0.958333f, 0.979167f // Band 18
|
||||
}};
|
||||
|
||||
} // namespace
|
||||
|
||||
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
||||
size_t sample_rate_hz,
|
||||
size_t frame_size_samples) {
|
||||
std::array<size_t, kNumBands> indexes;
|
||||
for (size_t i = 0; i < kNumBands; ++i) {
|
||||
indexes[i] =
|
||||
kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
|
||||
}
|
||||
return indexes;
|
||||
SpectralCorrelator::SpectralCorrelator()
|
||||
: weights_(kOpusBandWeights24kHz20ms.begin(),
|
||||
kOpusBandWeights24kHz20ms.end()) {}
|
||||
|
||||
SpectralCorrelator::~SpectralCorrelator() = default;
|
||||
|
||||
void SpectralCorrelator::ComputeAutoCorrelation(
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
|
||||
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const {
|
||||
ComputeCrossCorrelation(x, x, auto_corr);
|
||||
}
|
||||
|
||||
void ComputeBandCoefficients(
|
||||
rtc::FunctionView<float(size_t)> functor,
|
||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||
size_t max_freq_bin_index,
|
||||
rtc::ArrayView<float, kNumBands> coefficients) {
|
||||
std::fill(coefficients.begin(), coefficients.end(), 0.f);
|
||||
for (size_t i = 0; i < coefficients.size() - 1; ++i) {
|
||||
RTC_DCHECK_EQ(0.f, coefficients[i + 1]);
|
||||
RTC_DCHECK_GT(band_boundaries[i + 1], band_boundaries[i]);
|
||||
const size_t first_freq_bin = band_boundaries[i];
|
||||
const size_t last_freq_bin =
|
||||
std::min(max_freq_bin_index, first_freq_bin + band_boundaries[i + 1] -
|
||||
band_boundaries[i] - 1);
|
||||
// Depending on the sample rate, the highest bands can have no FFT
|
||||
// coefficients. Stop the iteration when coming across the first empty band.
|
||||
if (first_freq_bin >= last_freq_bin)
|
||||
break;
|
||||
const size_t band_size = last_freq_bin - first_freq_bin + 1;
|
||||
// Compute the band coefficient using a triangular band with peak response
|
||||
// at the band boundary.
|
||||
for (size_t j = first_freq_bin; j <= last_freq_bin; ++j) {
|
||||
const float w = static_cast<float>(j - first_freq_bin) / band_size;
|
||||
const float coefficient = functor(j);
|
||||
coefficients[i] += (1.f - w) * coefficient;
|
||||
coefficients[i + 1] += w * coefficient;
|
||||
void SpectralCorrelator::ComputeCrossCorrelation(
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
|
||||
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const {
|
||||
constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
|
||||
size_t k = 0; // Next Fourier coefficient index.
|
||||
cross_corr[0] = 0.f;
|
||||
for (size_t i = 0; i < kOpusBands24kHz - 1; ++i) {
|
||||
cross_corr[i + 1] = 0.f;
|
||||
for (int j = 0; j < kOpusScaleNumBins24kHz20ms[i]; ++j) { // Band size.
|
||||
const float v = x[k].real() * y[k].real() + x[k].imag() * y[k].imag();
|
||||
const float tmp = weights_[k] * v;
|
||||
cross_corr[i] += v - tmp;
|
||||
cross_corr[i + 1] += tmp;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
// The first and the last bands in the loop above only got half contribution.
|
||||
coefficients[0] *= 2.f;
|
||||
coefficients[coefficients.size() - 1] *= 2.f;
|
||||
// TODO(bugs.webrtc.org/9076): Replace the line above with
|
||||
// "coefficients[i] *= 2.f" (*) since we now assume that the last band is
|
||||
// always |kNumBands| - 1.
|
||||
// (*): "size_t i" must be declared before the main loop.
|
||||
cross_corr[0] *= 2.f; // The first band only gets half contribution.
|
||||
// The Nyquist coefficient is never used.
|
||||
RTC_DCHECK_EQ(k, kFftSizeBy2Plus1 - 1);
|
||||
}
|
||||
|
||||
void ComputeBandEnergies(
|
||||
rtc::ArrayView<const std::complex<float>> fft_coeffs,
|
||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||
rtc::ArrayView<float, kNumBands> band_energies) {
|
||||
RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
|
||||
auto functor = [fft_coeffs](const size_t freq_bin_index) -> float {
|
||||
return std::norm(fft_coeffs[freq_bin_index]);
|
||||
void ComputeSmoothedLogMagnitudeSpectrum(
|
||||
rtc::ArrayView<const float> bands_energy,
|
||||
rtc::ArrayView<float, kNumBands> log_bands_energy) {
|
||||
RTC_DCHECK_LE(bands_energy.size(), kNumBands);
|
||||
constexpr float kOneByHundred = 1e-2f;
|
||||
constexpr float kLogOneByHundred = -2.f;
|
||||
// Init.
|
||||
float log_max = kLogOneByHundred;
|
||||
float follow = kLogOneByHundred;
|
||||
const auto smooth = [&log_max, &follow](float x) {
|
||||
x = std::max(log_max - 7.f, std::max(follow - 1.5f, x));
|
||||
log_max = std::max(log_max, x);
|
||||
follow = std::max(follow - 1.5f, x);
|
||||
return x;
|
||||
};
|
||||
ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1,
|
||||
band_energies);
|
||||
}
|
||||
|
||||
void ComputeLogBandEnergiesCoefficients(
|
||||
rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
|
||||
rtc::ArrayView<float, kNumBands> log_band_energy_coeffs) {
|
||||
float log_max = -2.f;
|
||||
float follow = -2.f;
|
||||
for (size_t i = 0; i < band_energy_coeffs.size(); ++i) {
|
||||
log_band_energy_coeffs[i] = std::log10(1e-2f + band_energy_coeffs[i]);
|
||||
// Smoothing across frequency bands.
|
||||
log_band_energy_coeffs[i] = std::max(
|
||||
log_max - 7.f, std::max(follow - 1.5f, log_band_energy_coeffs[i]));
|
||||
log_max = std::max(log_max, log_band_energy_coeffs[i]);
|
||||
follow = std::max(follow - 1.5f, log_band_energy_coeffs[i]);
|
||||
// Smoothing over the bands for which the band energy is defined.
|
||||
for (size_t i = 0; i < bands_energy.size(); ++i) {
|
||||
log_bands_energy[i] = smooth(std::log10(kOneByHundred + bands_energy[i]));
|
||||
}
|
||||
// Smoothing over the remaining bands (zero energy).
|
||||
for (size_t i = bands_energy.size(); i < kNumBands; ++i) {
|
||||
log_bands_energy[i] = smooth(kLogOneByHundred);
|
||||
}
|
||||
}
|
||||
|
||||
@ -113,17 +154,28 @@ std::array<float, kNumBands * kNumBands> ComputeDctTable() {
|
||||
return dct_table;
|
||||
}
|
||||
|
||||
void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
|
||||
void ComputeDct(rtc::ArrayView<const float> in,
|
||||
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
|
||||
rtc::ArrayView<float> out) {
|
||||
// DCT scaling factor - i.e., sqrt(2 / kNumBands).
|
||||
constexpr float kDctScalingFactor = 0.301511345f;
|
||||
constexpr float kDctScalingFactorError =
|
||||
kDctScalingFactor * kDctScalingFactor -
|
||||
2.f / static_cast<float>(kNumBands);
|
||||
static_assert(
|
||||
(kDctScalingFactorError >= 0.f && kDctScalingFactorError < 1e-1f) ||
|
||||
(kDctScalingFactorError < 0.f && kDctScalingFactorError > -1e-1f),
|
||||
"kNumBands changed and kDctScalingFactor has not been updated.");
|
||||
RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported.";
|
||||
RTC_DCHECK_LE(in.size(), kNumBands);
|
||||
RTC_DCHECK_LE(1, out.size());
|
||||
RTC_DCHECK_LE(out.size(), in.size());
|
||||
std::fill(out.begin(), out.end(), 0.f);
|
||||
for (size_t i = 0; i < out.size(); ++i) {
|
||||
out[i] = 0.f;
|
||||
for (size_t j = 0; j < in.size(); ++j) {
|
||||
out[i] += in[j] * dct_table[j * in.size() + i];
|
||||
out[i] += in[j] * dct_table[j * kNumBands + i];
|
||||
}
|
||||
// TODO(bugs.webrtc.org/10480): Scaling factor in the DCT table.
|
||||
out[i] *= kDctScalingFactor;
|
||||
}
|
||||
}
|
||||
|
||||
@ -14,49 +14,75 @@
|
||||
#include <stddef.h>
|
||||
#include <array>
|
||||
#include <complex>
|
||||
#include <vector>
|
||||
|
||||
#include "api/array_view.h"
|
||||
#include "api/function_view.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
|
||||
// Computes FFT boundary indexes corresponding to sub-bands.
|
||||
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
||||
size_t sample_rate_hz,
|
||||
size_t frame_size_samples);
|
||||
// At a sample rate of 24 kHz, the last 3 Opus bands are beyond the Nyquist
|
||||
// frequency. However, band #19 gets the contributions from band #18 because
|
||||
// of the symmetric triangular filter with peak response at 12 kHz.
|
||||
constexpr size_t kOpusBands24kHz = 20;
|
||||
static_assert(kOpusBands24kHz < kNumBands,
|
||||
"The number of bands at 24 kHz must be less than those defined "
|
||||
"in the Opus scale at 48 kHz.");
|
||||
|
||||
// Iterates through frequency bands and computes coefficients via |functor| for
|
||||
// triangular bands with peak response at each band boundary. |functor| returns
|
||||
// a floating point value for the FFT coefficient having index equal to the
|
||||
// argument passed to |functor|; that argument is in the range {0, ...
|
||||
// |max_freq_bin_index| - 1}.
|
||||
void ComputeBandCoefficients(
|
||||
rtc::FunctionView<float(size_t)> functor,
|
||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||
const size_t max_freq_bin_index,
|
||||
rtc::ArrayView<float, kNumBands> coefficients);
|
||||
// Number of FFT frequency bins covered by each band in the Opus scale at a
|
||||
// sample rate of 24 kHz for 20 ms frames.
|
||||
// Declared here for unit testing.
|
||||
constexpr std::array<int, kOpusBands24kHz - 1> GetOpusScaleNumBins24kHz20ms() {
  // One entry per band: the number of FFT bins that the band spans. The
  // values are hard-coded; the return type is spelled out so that a change of
  // |kOpusBands24kHz| that makes the list too long fails to compile.
  return std::array<int, kOpusBands24kHz - 1>{
      {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 24, 24, 32, 48}};
}
|
||||
|
||||
// Given an array of FFT coefficients and a vector of band boundary indexes,
|
||||
// computes band energy coefficients.
|
||||
void ComputeBandEnergies(
|
||||
rtc::ArrayView<const std::complex<float>> fft_coeffs,
|
||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||
rtc::ArrayView<float, kNumBands> band_energies);
|
||||
// TODO(bugs.webrtc.org/10480): Move to a separate file.
|
||||
// Class to compute band-wise spectral features in the Opus perceptual scale
|
||||
// for 20 ms frames sampled at 24 kHz. The analysis methods apply triangular
|
||||
// filters with peak response at each band boundary.
|
||||
class SpectralCorrelator {
|
||||
public:
|
||||
// Ctor.
|
||||
SpectralCorrelator();
|
||||
SpectralCorrelator(const SpectralCorrelator&) = delete;
|
||||
SpectralCorrelator& operator=(const SpectralCorrelator&) = delete;
|
||||
~SpectralCorrelator();
|
||||
|
||||
// Computes log band energy coefficients.
|
||||
void ComputeLogBandEnergiesCoefficients(
|
||||
rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
|
||||
rtc::ArrayView<float, kNumBands> log_band_energy_coeffs);
|
||||
// Computes the band-wise spectral auto-correlations.
|
||||
void ComputeAutoCorrelation(
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
|
||||
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const;
|
||||
|
||||
// Creates a DCT table for arrays having size equal to |kNumBands|.
|
||||
// Computes the band-wise spectral cross-correlations.
|
||||
void ComputeCrossCorrelation(
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
|
||||
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
|
||||
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const;
|
||||
|
||||
private:
|
||||
const std::vector<float> weights_; // Weights for each Fourier coefficient.
|
||||
};
|
||||
|
||||
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
|
||||
// spectral_features.cc. Given a vector of Opus-bands energy coefficients,
|
||||
// computes the log magnitude spectrum applying smoothing both over time and
|
||||
// over frequency. Declared here for unit testing.
|
||||
void ComputeSmoothedLogMagnitudeSpectrum(
|
||||
rtc::ArrayView<const float> bands_energy,
|
||||
rtc::ArrayView<float, kNumBands> log_bands_energy);
|
||||
|
||||
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
|
||||
// spectral_features.cc. Creates a DCT table for arrays having size equal to
|
||||
// |kNumBands|. Declared here for unit testing.
|
||||
std::array<float, kNumBands * kNumBands> ComputeDctTable();
|
||||
|
||||
// Computes DCT for |in| given a pre-computed DCT table. In-place computation is
|
||||
// not allowed and |out| can be smaller than |in| in order to only compute the
|
||||
// first DCT coefficients.
|
||||
void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
|
||||
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
|
||||
// spectral_features.cc. Computes DCT for |in| given a pre-computed DCT table.
|
||||
// In-place computation is not allowed and |out| can be smaller than |in| in
|
||||
// order to only compute the first DCT coefficients. Declared here for unit
|
||||
// testing.
|
||||
void ComputeDct(rtc::ArrayView<const float> in,
|
||||
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
|
||||
rtc::ArrayView<float> out);
|
||||
|
||||
|
||||
@ -10,6 +10,13 @@
|
||||
|
||||
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <complex>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "api/array_view.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
|
||||
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
|
||||
// #include "test/fpe_observer.h"
|
||||
@ -20,58 +27,76 @@ namespace rnn_vad {
|
||||
namespace test {
|
||||
namespace {
|
||||
|
||||
constexpr size_t kSampleRate48kHz = 48000;
|
||||
constexpr size_t kFrameSize20ms48kHz = 2 * kSampleRate48kHz / 100;
|
||||
constexpr size_t kFftNumCoeffs20ms48kHz = kFrameSize20ms48kHz / 2 + 1;
|
||||
// Generates the values for the array named |kOpusBandWeights24kHz20ms| in the
|
||||
// anonymous namespace of the .cc file, which is the array of FFT coefficient
|
||||
// weights for the Opus scale triangular filters.
|
||||
std::vector<float> ComputeTriangularFiltersWeights() {
|
||||
constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
|
||||
const auto& v = kOpusScaleNumBins24kHz20ms; // Alias.
|
||||
const size_t num_weights = std::accumulate(
|
||||
kOpusScaleNumBins24kHz20ms.begin(), kOpusScaleNumBins24kHz20ms.end(), 0);
|
||||
std::vector<float> weights(num_weights);
|
||||
size_t next_fft_coeff_index = 0;
|
||||
for (size_t band = 0; band < v.size(); ++band) {
|
||||
const size_t band_size = v[band];
|
||||
for (size_t j = 0; j < band_size; ++j) {
|
||||
weights[next_fft_coeff_index + j] = static_cast<float>(j) / band_size;
|
||||
}
|
||||
next_fft_coeff_index += band_size;
|
||||
}
|
||||
return weights;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO(bugs.webrtc.org/9076): Remove this test before closing the issue.
|
||||
// Check that when using precomputed FFT coefficients for frames at 48 kHz, the
|
||||
// output of ComputeBandEnergies() is bit exact.
|
||||
TEST(RnnVadTest, ComputeBandEnergies48kHzBitExactness) {
|
||||
// Initialize input data reader and buffers.
|
||||
auto fft_coeffs_reader = CreateFftCoeffsReader();
|
||||
const size_t num_frames = fft_coeffs_reader.second;
|
||||
ASSERT_EQ(
|
||||
kFftNumCoeffs20ms48kHz,
|
||||
rtc::CheckedDivExact(fft_coeffs_reader.first->data_length(), num_frames) /
|
||||
2);
|
||||
std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_real;
|
||||
std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_imag;
|
||||
std::array<std::complex<float>, kFftNumCoeffs20ms48kHz> fft_coeffs;
|
||||
// Init expected output reader and buffer.
|
||||
auto band_energies_reader = CreateBandEnergyCoeffsReader();
|
||||
ASSERT_EQ(num_frames, band_energies_reader.second);
|
||||
std::array<float, kNumBands> expected_band_energies;
|
||||
// Init band energies coefficients computation.
|
||||
const auto band_boundary_indexes =
|
||||
ComputeBandBoundaryIndexes(kSampleRate48kHz, kFrameSize20ms48kHz);
|
||||
std::array<float, kNumBands> computed_band_energies;
|
||||
|
||||
// Check output for every frame.
|
||||
{
|
||||
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
|
||||
// FloatingPointExceptionObserver fpe_observer;
|
||||
for (size_t i = 0; i < num_frames; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
// Read input.
|
||||
fft_coeffs_reader.first->ReadChunk(fft_coeffs_real);
|
||||
fft_coeffs_reader.first->ReadChunk(fft_coeffs_imag);
|
||||
for (size_t i = 0; i < kFftNumCoeffs20ms48kHz; ++i) {
|
||||
fft_coeffs[i].real(fft_coeffs_real[i]);
|
||||
fft_coeffs[i].imag(fft_coeffs_imag[i]);
|
||||
}
|
||||
band_energies_reader.first->ReadChunk(expected_band_energies);
|
||||
// Compute band energy coefficients and check output.
|
||||
ComputeBandEnergies(fft_coeffs, band_boundary_indexes,
|
||||
computed_band_energies);
|
||||
ExpectEqualFloatArray(expected_band_energies, computed_band_energies);
|
||||
}
|
||||
// Checks that the values returned by GetOpusScaleNumBins24kHz20ms() match the
|
||||
// Opus scale frequency boundaries.
|
||||
TEST(RnnVadTest, TestOpusScaleBoundaries) {
  // Band boundaries in Hz; the leading 0 Hz boundary is omitted since |prev|
  // below starts from bin 0.
  constexpr int kBandFrequencyBoundariesHz[kNumBands - 1] = {
      200,  400,  600,  800,  1000, 1200, 1400,  1600,  2000,  2400, 2800,
      3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
  constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
  // Maps a frequency in Hz to the corresponding FFT bin index.
  const auto hz_to_bin = [](int frequency_hz) -> int {
    return frequency_hz * kFrameSize20ms24kHz / kSampleRate24kHz;
  };
  const size_t num_checked_bands = kOpusScaleNumBins24kHz20ms.size();
  int prev = 0;
  for (size_t i = 0; i != num_checked_bands; ++i) {
    const int boundary = hz_to_bin(kBandFrequencyBoundariesHz[i]);
    // The size of a band must equal the distance between two consecutive
    // boundaries expressed in FFT bins.
    EXPECT_EQ(kOpusScaleNumBins24kHz20ms[i], boundary - prev);
    prev = boundary;
  }
}
|
||||
|
||||
TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) {
|
||||
// Checks that the computed triangular filters weights for the Opus scale are
|
||||
// monotonic within each Opus band. This test should only be enabled when
|
||||
// ComputeTriangularFiltersWeights() is changed and |kOpusBandWeights24kHz20ms|
|
||||
// is updated accordingly.
|
||||
TEST(RnnVadTest, DISABLED_TestOpusScaleWeights) {
  const std::vector<float> weights = ComputeTriangularFiltersWeights();
  size_t band_begin = 0;  // Index of the first weight of the current band.
  for (size_t band_size : GetOpusScaleNumBins24kHz20ms()) {
    SCOPED_TRACE(band_size);
    const size_t band_end = band_begin + band_size;
    // Weights must be strictly increasing inside the band; the initial value
    // of |prev| also requires the first weight to be greater than -1.
    float prev = -1.f;
    for (size_t k = band_begin; k < band_end; ++k) {
      const float weight = weights[k];
      EXPECT_LT(prev, weight);
      prev = weight;
    }
    band_begin = band_end;
  }
}
|
||||
|
||||
TEST(RnnVadTest, SpectralCorrelatorValidOutput) {
  // Input: every Fourier coefficient equals 1 + j.
  const std::complex<float> kCoefficient(1.f, 1.f);
  std::array<std::complex<float>, kFftSizeBy2Plus1> in;
  in.fill(kCoefficient);
  // Compute the band-wise auto-correlation of the input.
  std::array<float, kOpusBands24kHz> out;
  SpectralCorrelator correlator;
  correlator.ComputeAutoCorrelation(in, out);
  // Every band is expected to have a strictly positive coefficient.
  for (size_t i = 0; i != kOpusBands24kHz; ++i) {
    SCOPED_TRACE(i);
    EXPECT_GT(out[i], 0.f);
  }
}
|
||||
|
||||
TEST(RnnVadTest, ComputeSmoothedLogMagnitudeSpectrumWithinTolerance) {
|
||||
constexpr std::array<float, kNumBands> input = {
|
||||
{86.060539245605f, 275.668334960938f, 43.406528472900f, 6.541896820068f,
|
||||
17.964015960693f, 8.090919494629f, 1.261920094490f, 1.212702631950f,
|
||||
@ -90,7 +115,7 @@ TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) {
|
||||
{
|
||||
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
|
||||
// FloatingPointExceptionObserver fpe_observer;
|
||||
ComputeLogBandEnergiesCoefficients(input, computed_output);
|
||||
ComputeSmoothedLogMagnitudeSpectrum(input, computed_output);
|
||||
ExpectNearAbsolute(expected_output, computed_output, 1e-5f);
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,15 +32,35 @@ void WriteTestData(rtc::ArrayView<float> samples) {
|
||||
}
|
||||
}
|
||||
|
||||
SpectralFeaturesView GetSpectralFeaturesView(
|
||||
rtc::ArrayView<float, kNumBands - kNumLowerBands> GetHigherBandsSpectrum(
|
||||
std::array<float, kTestFeatureVectorSize>* feature_vector) {
|
||||
return {
|
||||
{feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands},
|
||||
{feature_vector->data(), kNumLowerBands},
|
||||
{feature_vector->data() + kNumBands, kNumLowerBands},
|
||||
{feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands},
|
||||
{feature_vector->data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
|
||||
&(*feature_vector)[kNumBands + 3 * kNumLowerBands]};
|
||||
return {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands};
|
||||
}
|
||||
|
||||
// Returns a view on the first |kNumLowerBands| items of |feature_vector|.
rtc::ArrayView<float, kNumLowerBands> GetAverage(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const begin = feature_vector->data();
  return {begin, kNumLowerBands};
}
|
||||
|
||||
// Returns a view on the |kNumLowerBands| items of |feature_vector| that start
// at offset |kNumBands|.
rtc::ArrayView<float, kNumLowerBands> GetFirstDerivative(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const begin = feature_vector->data() + kNumBands;
  return {begin, kNumLowerBands};
}
|
||||
|
||||
// Returns a view on the |kNumLowerBands| items of |feature_vector| that start
// at offset |kNumBands| + |kNumLowerBands|.
rtc::ArrayView<float, kNumLowerBands> GetSecondDerivative(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const begin = feature_vector->data() + kNumBands + kNumLowerBands;
  return {begin, kNumLowerBands};
}
|
||||
|
||||
// Returns a view on the |kNumLowerBands| items of |feature_vector| that start
// at offset |kNumBands| + 2 * |kNumLowerBands|.
rtc::ArrayView<float, kNumLowerBands> GetCepstralCrossCorrelation(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const begin =
      feature_vector->data() + kNumBands + 2 * kNumLowerBands;
  return {begin, kNumLowerBands};
}
|
||||
|
||||
float* GetCepstralVariability(
|
||||
std::array<float, kTestFeatureVectorSize>* feature_vector) {
|
||||
return feature_vector->data() + kNumBands + 3 * kNumLowerBands;
|
||||
}
|
||||
|
||||
constexpr float kInitialFeatureVal = -9999.f;
|
||||
@ -54,7 +74,6 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
|
||||
rtc::ArrayView<float, kFrameSize20ms24kHz> samples_view(samples);
|
||||
bool is_silence;
|
||||
std::array<float, kTestFeatureVectorSize> feature_vector;
|
||||
auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
|
||||
|
||||
// Write an initial value in the feature vector to detect changes.
|
||||
std::fill(feature_vector.begin(), feature_vector.end(), kInitialFeatureVal);
|
||||
@ -64,8 +83,12 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
|
||||
|
||||
// With silence.
|
||||
std::fill(samples.begin(), samples.end(), 0.f);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||
feature_vector_view);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(
|
||||
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
|
||||
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
|
||||
GetSecondDerivative(&feature_vector),
|
||||
GetCepstralCrossCorrelation(&feature_vector),
|
||||
GetCepstralVariability(&feature_vector));
|
||||
// Silence is expected, the output won't be overwritten.
|
||||
EXPECT_TRUE(is_silence);
|
||||
EXPECT_TRUE(std::all_of(feature_vector.begin(), feature_vector.end(),
|
||||
@ -73,18 +96,22 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
|
||||
|
||||
// With no silence.
|
||||
WriteTestData(samples);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||
feature_vector_view);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(
|
||||
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
|
||||
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
|
||||
GetSecondDerivative(&feature_vector),
|
||||
GetCepstralCrossCorrelation(&feature_vector),
|
||||
GetCepstralVariability(&feature_vector));
|
||||
// Silence is not expected, the output will be overwritten.
|
||||
EXPECT_FALSE(is_silence);
|
||||
EXPECT_FALSE(std::all_of(feature_vector.begin(), feature_vector.end(),
|
||||
[](float x) { return x == kInitialFeatureVal; }));
|
||||
}
|
||||
|
||||
// When the input signal does not change, the spectral coefficients average does
|
||||
// not change and the derivatives are zero. Similarly, the spectral variability
|
||||
// When the input signal does not change, the cepstral coefficients average does
|
||||
// not change and the derivatives are zero. Similarly, the cepstral variability
|
||||
// score does not change either.
|
||||
TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
|
||||
TEST(RnnVadTest, CepstralFeaturesConstantAverageZeroDerivative) {
|
||||
// Initialize.
|
||||
SpectralFeaturesExtractor sfe;
|
||||
std::array<float, kFrameSize20ms24kHz> samples;
|
||||
@ -94,17 +121,24 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
|
||||
|
||||
// Fill the spectral features with test data.
|
||||
std::array<float, kTestFeatureVectorSize> feature_vector;
|
||||
auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
|
||||
for (size_t i = 0; i < kSpectralCoeffsHistorySize; ++i) {
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||
feature_vector_view);
|
||||
for (size_t i = 0; i < kCepstralCoeffsHistorySize; ++i) {
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(
|
||||
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
|
||||
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
|
||||
GetSecondDerivative(&feature_vector),
|
||||
GetCepstralCrossCorrelation(&feature_vector),
|
||||
GetCepstralVariability(&feature_vector));
|
||||
}
|
||||
|
||||
// Feed the test data one last time but using a different output vector.
|
||||
std::array<float, kTestFeatureVectorSize> feature_vector_last;
|
||||
auto feature_vector_last_view = GetSpectralFeaturesView(&feature_vector_last);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||
feature_vector_last_view);
|
||||
is_silence = sfe.CheckSilenceComputeFeatures(
|
||||
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector_last),
|
||||
GetAverage(&feature_vector_last),
|
||||
GetFirstDerivative(&feature_vector_last),
|
||||
GetSecondDerivative(&feature_vector_last),
|
||||
GetCepstralCrossCorrelation(&feature_vector_last),
|
||||
GetCepstralVariability(&feature_vector_last));
|
||||
|
||||
// Average is unchanged.
|
||||
ExpectEqualFloatArray({feature_vector.data(), kNumLowerBands},
|
||||
@ -116,7 +150,7 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
|
||||
ExpectEqualFloatArray(
|
||||
{feature_vector_last.data() + kNumBands + kNumLowerBands, kNumLowerBands},
|
||||
zeros);
|
||||
// Spectral variability is unchanged.
|
||||
// Variability is unchanged.
|
||||
EXPECT_FLOAT_EQ(feature_vector[kNumBands + 3 * kNumLowerBands],
|
||||
feature_vector_last[kNumBands + 3 * kNumLowerBands]);
|
||||
}
|
||||
|
||||
@ -87,14 +87,6 @@ ReaderPairType CreateFftCoeffsReader() {
|
||||
return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), row_size)};
|
||||
}
|
||||
|
||||
ReaderPairType CreateBandEnergyCoeffsReader() {
  // Each chunk read from the file holds one value per band.
  constexpr size_t num_bands = 22;
  auto reader = absl::make_unique<BinaryFileReader<float>>(
      test::ResourcePath("audio_processing/agc2/rnn_vad/band_energies", "dat"),
      num_bands);
  // The data length must be an exact multiple of |num_bands|. The count is
  // computed while |reader| still owns the object, i.e., before it is moved
  // into the returned pair.
  const auto num_frames =
      rtc::CheckedDivExact(reader->data_length(), num_bands);
  return {std::move(reader), num_frames};
}
|
||||
|
||||
ReaderPairType CreateSilenceFlagsFeatureMatrixReader() {
|
||||
constexpr size_t feature_vector_size = 42;
|
||||
auto ptr = absl::make_unique<BinaryFileReader<float>>(
|
||||
|
||||
@ -110,9 +110,6 @@ CreateLpResidualAndPitchPeriodGainReader();
|
||||
// Creates a reader for the FFT coefficients.
|
||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
||||
CreateFftCoeffsReader();
|
||||
// Creates a reader for the band energy coefficients.
|
||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
||||
CreateBandEnergyCoeffsReader();
|
||||
// Creates a reader for the silence flags and the feature matrix.
|
||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
||||
CreateSilenceFlagsFeatureMatrixReader();
|
||||
|
||||
Reference in New Issue
Block a user