RNN VAD: Opus band spectral analysis refactoring

This CL refactors the computation of band energy and spectral
cross-correlation coefficients by moving and optimizing
the code from ComputeBandCoefficients, ComputeBandEnergies and
ComputeSpectralCrossCorrelation into a single class (named
BandFeaturesExtractor).

This change will also help with replacing the FFT library in the RNN VAD.

Bug: webrtc:10480
Change-Id: I6cefa23e8f3bc8de6eb09d3ea434699d5e19124e
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/129726
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#27535}
This commit is contained in:
Alessio Bazzica
2019-04-10 09:36:21 +02:00
committed by Commit Bot
parent d93a004086
commit 4a53766c84
14 changed files with 439 additions and 361 deletions

View File

@ -38,7 +38,6 @@ rtc_source_set("rnn_vad") {
deps = [
"..:biquad_filter",
"../../../../api:array_view",
"../../../../api:function_view",
"../../../../rtc_base:checks",
"../../../../rtc_base:rtc_base_approved",
"../../utility:pffft_wrapper",

View File

@ -52,17 +52,13 @@ constexpr size_t kNumInvertedLags12kHz = kMaxPitch12kHz - kInitialMinPitch12kHz;
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
// Sub-band frequency boundaries.
// Spectral features.
constexpr size_t kFftSizeBy2Plus1 = kFrameSize20ms24kHz / 2 + 1;
constexpr size_t kNumBands = 22;
constexpr int kBandFrequencyBoundaries[kNumBands] = {
0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400,
2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
// Feature extraction parameters.
constexpr size_t kNumLowerBands = 6;
static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), "");
constexpr size_t kSpectralCoeffsHistorySize = 8;
static_assert(kSpectralCoeffsHistorySize > 2,
constexpr size_t kCepstralCoeffsHistorySize = 8;
static_assert(kCepstralCoeffsHistorySize > 2,
"The history size must at least be 3 to compute first and second "
"derivatives.");

View File

@ -78,12 +78,12 @@ bool FeaturesExtractor::CheckSilenceComputeFeatures(
// and write the feature vector.
return spectral_features_extractor_.CheckSilenceComputeFeatures(
reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz},
{{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
{feature_vector.data(), kNumLowerBands},
{feature_vector.data() + kNumBands, kNumLowerBands},
{feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
{feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
&feature_vector[kFeatureVectorSize - 1]});
{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
{feature_vector.data(), kNumLowerBands},
{feature_vector.data() + kNumBands, kNumLowerBands},
{feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
{feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
&feature_vector[kFeatureVectorSize - 1]);
}
} // namespace rnn_vad

View File

@ -35,16 +35,16 @@ std::array<float, kHalfFrameSize> ComputeHalfVorbisWindow() {
} // namespace
BandAnalysisFft::BandAnalysisFft()
FftUtil::FftUtil()
: half_window_(ComputeHalfVorbisWindow()),
fft_(static_cast<int>(input_buf_.size())) {}
BandAnalysisFft::~BandAnalysisFft() = default;
FftUtil::~FftUtil() = default;
void BandAnalysisFft::ForwardFft(rtc::ArrayView<const float> samples,
rtc::ArrayView<std::complex<float>> dst) {
void FftUtil::WindowedFft(rtc::ArrayView<const float> samples,
rtc::ArrayView<std::complex<float>> dst) {
RTC_DCHECK_EQ(samples.size(), kFrameSize20ms24kHz);
RTC_DCHECK_EQ(dst.size(), kFrameSize20ms24kHz / 2 + 1);
RTC_DCHECK_EQ(dst.size(), kFftSizeBy2Plus1);
// Apply windowing.
RTC_DCHECK_EQ(input_buf_.size(), 2 * half_window_.size());
for (size_t i = 0; i < input_buf_.size() / 2; ++i) {

View File

@ -21,32 +21,31 @@
namespace webrtc {
namespace rnn_vad {
// TODO(alessiob): Switch to PFFFT using its own wrapper.
// TODO(alessiob): Delete this class when switching to PFFFT.
// TODO(alessiob): Switch to PFFFT and remove this class.
// FFT implementation wrapper for the band-wise analysis step in which 20 ms
// frames at 24 kHz are analyzed in the frequency domain. The goals of this
// class are (i) making it easy to switch to another FFT implementation, (ii)
// owning the input buffer for the FFT and (iii) applying a windowing function
// before computing the FFT.
class BandAnalysisFft {
class FftUtil {
public:
BandAnalysisFft();
BandAnalysisFft(const BandAnalysisFft&) = delete;
BandAnalysisFft& operator=(const BandAnalysisFft&) = delete;
~BandAnalysisFft();
FftUtil();
FftUtil(const FftUtil&) = delete;
FftUtil& operator=(const FftUtil&) = delete;
~FftUtil();
// Applies a windowing function to |samples|, computes the real forward FFT
// and writes the result in |dst|.
// The size of |samples| must be 480 (20 ms at 24 kHz).
// The size of |dst| must be 241 since the complex conjugate is not written.
void ForwardFft(rtc::ArrayView<const float> samples,
rtc::ArrayView<std::complex<float>> dst);
void WindowedFft(rtc::ArrayView<const float> samples,
rtc::ArrayView<std::complex<float>> dst);
private:
static_assert((kFrameSize20ms24kHz & 1) == 0,
"kFrameSize20ms24kHz must be even.");
const std::array<float, kFrameSize20ms24kHz / 2> half_window_;
std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_{};
std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_{};
std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_;
std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_;
rnnoise::KissFft fft_;
};

View File

@ -39,16 +39,16 @@ std::vector<float> CreateSine(float amplitude,
} // namespace
TEST(RnnVadTest, BandAnalysisFftTest) {
TEST(RnnVadTest, FftUtilTest) {
for (float frequency_hz : {200.f, 450.f, 1500.f}) {
SCOPED_TRACE(frequency_hz);
auto x = CreateSine(
/*amplitude=*/1000.f, frequency_hz,
/*duration_s=*/0.02f,
/*sample_rate_hz=*/kSampleRate24kHz);
BandAnalysisFft analyzer;
FftUtil analyzer;
std::vector<std::complex<float>> x_fft(x.size() / 2 + 1);
analyzer.ForwardFft(x, x_fft);
analyzer.WindowedFft(x, x_fft);
int peak_fft_bin_index = std::distance(
x_fft.begin(),
std::max_element(x_fft.begin(), x_fft.end(),

View File

@ -15,7 +15,6 @@
#include <limits>
#include <numeric>
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include "rtc_base/checks.h"
namespace webrtc {
@ -24,21 +23,21 @@ namespace {
constexpr float kSilenceThreshold = 0.04f;
// Computes the new spectral difference stats and pushes them into the passed
// Computes the new cepstral difference stats and pushes them into the passed
// symmetric matrix buffer.
void UpdateSpectralDifferenceStats(
rtc::ArrayView<const float, kNumBands> new_spectral_coeffs,
const RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>& ring_buf,
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize>* sym_matrix_buf) {
void UpdateCepstralDifferenceStats(
rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs,
const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf,
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) {
RTC_DCHECK(sym_matrix_buf);
// Compute the new spectral distance stats.
std::array<float, kSpectralCoeffsHistorySize - 1> distances;
for (size_t i = 0; i < kSpectralCoeffsHistorySize - 1; ++i) {
// Compute the new cepstral distance stats.
std::array<float, kCepstralCoeffsHistorySize - 1> distances;
for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) {
const size_t delay = i + 1;
auto old_spectral_coeffs = ring_buf.GetArrayView(delay);
auto old_cepstral_coeffs = ring_buf.GetArrayView(delay);
distances[i] = 0.f;
for (size_t k = 0; k < kNumBands; ++k) {
const float c = new_spectral_coeffs[k] - old_spectral_coeffs[k];
const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k];
distances[i] += c * c;
}
}
@ -48,96 +47,77 @@ void UpdateSpectralDifferenceStats(
} // namespace
SpectralFeaturesView::SpectralFeaturesView(
rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> cross_correlations,
float* variability)
: coeffs(coeffs),
average(average),
first_derivative(first_derivative),
second_derivative(second_derivative),
cross_correlations(cross_correlations),
variability(variability) {}
SpectralFeaturesView::SpectralFeaturesView(const SpectralFeaturesView&) =
default;
SpectralFeaturesView::~SpectralFeaturesView() = default;
SpectralFeaturesExtractor::SpectralFeaturesExtractor()
: fft_(),
reference_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
lagged_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
band_boundaries_(
ComputeBandBoundaryIndexes(kSampleRate24kHz, kFrameSize20ms24kHz)),
reference_frame_fft_(kFftSizeBy2Plus1),
lagged_frame_fft_(kFftSizeBy2Plus1),
dct_table_(ComputeDctTable()) {}
SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
void SpectralFeaturesExtractor::Reset() {
spectral_coeffs_ring_buf_.Reset();
spectral_diffs_buf_.Reset();
cepstral_coeffs_ring_buf_.Reset();
cepstral_diffs_buf_.Reset();
}
bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
SpectralFeaturesView spectral_features) {
// Analyze reference frame.
fft_.ForwardFft(reference_frame, reference_frame_fft_);
ComputeBandEnergies(reference_frame_fft_, band_boundaries_,
reference_frame_energy_coeffs_);
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
float* variability) {
// Compute the Opus band energies for the reference frame.
fft_.WindowedFft(reference_frame, reference_frame_fft_);
spectral_correlator_.ComputeAutoCorrelation(
{reference_frame_fft_.data(), kFftSizeBy2Plus1},
reference_frame_bands_energy_);
// Check if the reference frame has silence.
const float tot_energy =
std::accumulate(reference_frame_energy_coeffs_.begin(),
reference_frame_energy_coeffs_.end(), 0.f);
if (tot_energy < kSilenceThreshold)
std::accumulate(reference_frame_bands_energy_.begin(),
reference_frame_bands_energy_.end(), 0.f);
if (tot_energy < kSilenceThreshold) {
return true;
// Analyze lagged frame.
fft_.ForwardFft(lagged_frame, lagged_frame_fft_);
ComputeBandEnergies(lagged_frame_fft_, band_boundaries_,
lagged_frame_energy_coeffs_);
}
// Compute the Opus band energies for the lagged frame.
fft_.WindowedFft(lagged_frame, lagged_frame_fft_);
spectral_correlator_.ComputeAutoCorrelation(
{lagged_frame_fft_.data(), kFftSizeBy2Plus1}, lagged_frame_bands_energy_);
// Log of the band energies for the reference frame.
std::array<float, kNumBands> log_band_energy_coeffs;
ComputeLogBandEnergiesCoefficients(reference_frame_energy_coeffs_,
log_band_energy_coeffs);
// Decorrelate band-wise log energy coefficients via DCT.
std::array<float, kNumBands> log_band_energy_coeffs_decorrelated;
ComputeDct(log_band_energy_coeffs, dct_table_,
log_band_energy_coeffs_decorrelated);
// Normalize (based on training set stats).
log_band_energy_coeffs_decorrelated[0] -= 12;
log_band_energy_coeffs_decorrelated[1] -= 4;
// Update the ring buffer and the spectral difference stats.
spectral_coeffs_ring_buf_.Push(log_band_energy_coeffs_decorrelated);
UpdateSpectralDifferenceStats(log_band_energy_coeffs_decorrelated,
spectral_coeffs_ring_buf_,
&spectral_diffs_buf_);
// Write the higher bands spectral coefficients.
auto coeffs_src = spectral_coeffs_ring_buf_.GetArrayView(0);
RTC_DCHECK_EQ(coeffs_src.size() - kNumLowerBands,
spectral_features.coeffs.size());
std::copy(coeffs_src.begin() + kNumLowerBands, coeffs_src.end(),
spectral_features.coeffs.begin());
std::array<float, kNumBands> log_bands_energy;
ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_,
log_bands_energy);
// Reference frame cepstrum.
std::array<float, kNumBands> cepstrum;
ComputeDct(log_bands_energy, dct_table_, cepstrum);
// Ad-hoc correction terms for the first two cepstral coefficients.
cepstrum[0] -= 12.f;
cepstrum[1] -= 4.f;
// Update the ring buffer and the cepstral difference stats.
cepstral_coeffs_ring_buf_.Push(cepstrum);
UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_,
&cepstral_diffs_buf_);
// Write the higher bands cepstral coefficients.
RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size());
std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(),
higher_bands_cepstrum.begin());
// Compute and write remaining features.
ComputeAvgAndDerivatives(spectral_features.average,
spectral_features.first_derivative,
spectral_features.second_derivative);
ComputeCrossCorrelation(spectral_features.cross_correlations);
RTC_DCHECK(spectral_features.variability);
*(spectral_features.variability) = ComputeVariability();
ComputeAvgAndDerivatives(average, first_derivative, second_derivative);
ComputeNormalizedCepstralCorrelation(bands_cross_corr);
RTC_DCHECK(variability);
*variability = ComputeVariability();
return false;
}
void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative) {
auto curr = spectral_coeffs_ring_buf_.GetArrayView(0);
auto prev1 = spectral_coeffs_ring_buf_.GetArrayView(1);
auto prev2 = spectral_coeffs_ring_buf_.GetArrayView(2);
rtc::ArrayView<float, kNumLowerBands> second_derivative) const {
auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0);
auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1);
auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2);
RTC_DCHECK_EQ(average.size(), first_derivative.size());
RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
RTC_DCHECK_LE(average.size(), curr.size());
@ -151,47 +131,41 @@ void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
}
}
void SpectralFeaturesExtractor::ComputeCrossCorrelation(
rtc::ArrayView<float, kNumLowerBands> cross_correlations) {
const auto& x = reference_frame_fft_;
const auto& y = lagged_frame_fft_;
auto cross_corr = [x, y](const size_t freq_bin_index) -> float {
return (x[freq_bin_index].real() * y[freq_bin_index].real() +
x[freq_bin_index].imag() * y[freq_bin_index].imag());
};
std::array<float, kNumBands> cross_corr_coeffs;
constexpr size_t kNumFftPoints = kFrameSize20ms24kHz / 2 + 1;
ComputeBandCoefficients(cross_corr, band_boundaries_, kNumFftPoints - 1,
cross_corr_coeffs);
void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation(
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) {
spectral_correlator_.ComputeCrossCorrelation(
{reference_frame_fft_.data(), kFftSizeBy2Plus1},
{lagged_frame_fft_.data(), kFftSizeBy2Plus1}, bands_cross_corr_);
// Normalize.
for (size_t i = 0; i < cross_corr_coeffs.size(); ++i) {
cross_corr_coeffs[i] =
cross_corr_coeffs[i] /
std::sqrt(0.001f + reference_frame_energy_coeffs_[i] *
lagged_frame_energy_coeffs_[i]);
for (size_t i = 0; i < bands_cross_corr_.size(); ++i) {
bands_cross_corr_[i] =
bands_cross_corr_[i] /
std::sqrt(0.001f + reference_frame_bands_energy_[i] *
lagged_frame_bands_energy_[i]);
}
// Decorrelate.
ComputeDct(cross_corr_coeffs, dct_table_, cross_correlations);
// Normalize (based on training set stats).
cross_correlations[0] -= 1.3f;
cross_correlations[1] -= 0.9f;
// Cepstrum.
ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr);
// Ad-hoc correction terms for the first two cepstral coefficients.
bands_cross_corr[0] -= 1.3f;
bands_cross_corr[1] -= 0.9f;
}
float SpectralFeaturesExtractor::ComputeVariability() {
// Compute spectral variability score.
float spec_variability = 0.f;
for (size_t delay1 = 0; delay1 < kSpectralCoeffsHistorySize; ++delay1) {
float SpectralFeaturesExtractor::ComputeVariability() const {
// Compute cepstral variability score.
float variability = 0.f;
for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) {
float min_dist = std::numeric_limits<float>::max();
for (size_t delay2 = 0; delay2 < kSpectralCoeffsHistorySize; ++delay2) {
for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) {
if (delay1 == delay2) // The distance would be 0.
continue;
min_dist =
std::min(min_dist, spectral_diffs_buf_.GetValue(delay1, delay2));
std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2));
}
spec_variability += min_dist;
variability += min_dist;
}
// Normalize (based on training set stats).
return spec_variability / kSpectralCoeffsHistorySize - 2.1f;
// TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction.
return variability / kCepstralCoeffsHistorySize - 2.1f;
}
} // namespace rnn_vad

View File

@ -20,34 +20,12 @@
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/fft_util.h"
#include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h"
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h"
namespace webrtc {
namespace rnn_vad {
// View on spectral features.
class SpectralFeaturesView {
public:
SpectralFeaturesView(rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> cross_correlations,
float* variability);
SpectralFeaturesView(const SpectralFeaturesView&);
~SpectralFeaturesView();
// Higher bands spectral coefficients.
const rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs;
// Average and first and second derivative over time for the lower bands.
const rtc::ArrayView<float, kNumLowerBands> average;
const rtc::ArrayView<float, kNumLowerBands> first_derivative;
const rtc::ArrayView<float, kNumLowerBands> second_derivative;
// Spectral cross-correlation for the lower bands.
const rtc::ArrayView<float, kNumLowerBands> cross_correlations;
// Spectral variability score.
float* const variability;
};
// Class to compute spectral features.
class SpectralFeaturesExtractor {
public:
@ -64,27 +42,33 @@ class SpectralFeaturesExtractor {
bool CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
SpectralFeaturesView spectral_features);
rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative,
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
float* variability);
private:
void ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
rtc::ArrayView<float, kNumLowerBands> second_derivative);
void ComputeCrossCorrelation(
rtc::ArrayView<float, kNumLowerBands> cross_correlations);
float ComputeVariability();
rtc::ArrayView<float, kNumLowerBands> second_derivative) const;
void ComputeNormalizedCepstralCorrelation(
rtc::ArrayView<float, kNumLowerBands> bands_cross_corr);
float ComputeVariability() const;
BandAnalysisFft fft_;
FftUtil fft_;
std::vector<std::complex<float>> reference_frame_fft_;
std::vector<std::complex<float>> lagged_frame_fft_;
std::array<float, kNumBands> reference_frame_energy_coeffs_{};
std::array<float, kNumBands> lagged_frame_energy_coeffs_{};
const std::array<size_t, kNumBands> band_boundaries_;
SpectralCorrelator spectral_correlator_;
std::array<float, kOpusBands24kHz> reference_frame_bands_energy_;
std::array<float, kOpusBands24kHz> lagged_frame_bands_energy_;
std::array<float, kOpusBands24kHz> bands_cross_corr_;
const std::array<float, kNumBands * kNumBands> dct_table_;
RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>
spectral_coeffs_ring_buf_;
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize> spectral_diffs_buf_;
RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>
cepstral_coeffs_ring_buf_;
SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize> cepstral_diffs_buf_;
};
} // namespace rnn_vad

View File

@ -20,85 +20,126 @@ namespace webrtc {
namespace rnn_vad {
namespace {
// DCT scaling factor.
static_assert(
kNumBands == 22,
"kNumBands changed! Please update the value of kDctScalingFactor");
constexpr float kDctScalingFactor = 0.301511345f; // sqrt(2 / kNumBands)
// Weights for each FFT coefficient for each Opus band (Nyquist frequency
// excluded). The size of each band is specified in
// |kOpusScaleNumBins24kHz20ms|.
constexpr std::array<float, kFrameSize20ms24kHz / 2> kOpusBandWeights24kHz20ms =
{{
0.f, 0.25f, 0.5f, 0.75f, // Band 0
0.f, 0.25f, 0.5f, 0.75f, // Band 1
0.f, 0.25f, 0.5f, 0.75f, // Band 2
0.f, 0.25f, 0.5f, 0.75f, // Band 3
0.f, 0.25f, 0.5f, 0.75f, // Band 4
0.f, 0.25f, 0.5f, 0.75f, // Band 5
0.f, 0.25f, 0.5f, 0.75f, // Band 6
0.f, 0.25f, 0.5f, 0.75f, // Band 7
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 8
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 9
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 10
0.f, 0.125f, 0.25f, 0.375f, 0.5f,
0.625f, 0.75f, 0.875f, // Band 11
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 12
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 13
0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
0.9375f, // Band 14
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 15
0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 16
0.f, 0.03125f, 0.0625f, 0.09375f, 0.125f,
0.15625f, 0.1875f, 0.21875f, 0.25f, 0.28125f,
0.3125f, 0.34375f, 0.375f, 0.40625f, 0.4375f,
0.46875f, 0.5f, 0.53125f, 0.5625f, 0.59375f,
0.625f, 0.65625f, 0.6875f, 0.71875f, 0.75f,
0.78125f, 0.8125f, 0.84375f, 0.875f, 0.90625f,
0.9375f, 0.96875f, // Band 17
0.f, 0.0208333f, 0.0416667f, 0.0625f, 0.0833333f,
0.104167f, 0.125f, 0.145833f, 0.166667f, 0.1875f,
0.208333f, 0.229167f, 0.25f, 0.270833f, 0.291667f,
0.3125f, 0.333333f, 0.354167f, 0.375f, 0.395833f,
0.416667f, 0.4375f, 0.458333f, 0.479167f, 0.5f,
0.520833f, 0.541667f, 0.5625f, 0.583333f, 0.604167f,
0.625f, 0.645833f, 0.666667f, 0.6875f, 0.708333f,
0.729167f, 0.75f, 0.770833f, 0.791667f, 0.8125f,
0.833333f, 0.854167f, 0.875f, 0.895833f, 0.916667f,
0.9375f, 0.958333f, 0.979167f // Band 18
}};
} // namespace
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
size_t sample_rate_hz,
size_t frame_size_samples) {
std::array<size_t, kNumBands> indexes;
for (size_t i = 0; i < kNumBands; ++i) {
indexes[i] =
kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
}
return indexes;
SpectralCorrelator::SpectralCorrelator()
: weights_(kOpusBandWeights24kHz20ms.begin(),
kOpusBandWeights24kHz20ms.end()) {}
SpectralCorrelator::~SpectralCorrelator() = default;
// Computes the band-wise auto-correlation of |x| and writes it into
// |auto_corr|. Implemented as the cross-correlation of |x| with itself, so each
// per-bin term reduces to real^2 + imag^2 before the band weights are applied.
void SpectralCorrelator::ComputeAutoCorrelation(
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const {
ComputeCrossCorrelation(x, x, auto_corr);
}
void ComputeBandCoefficients(
rtc::FunctionView<float(size_t)> functor,
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
size_t max_freq_bin_index,
rtc::ArrayView<float, kNumBands> coefficients) {
std::fill(coefficients.begin(), coefficients.end(), 0.f);
for (size_t i = 0; i < coefficients.size() - 1; ++i) {
RTC_DCHECK_EQ(0.f, coefficients[i + 1]);
RTC_DCHECK_GT(band_boundaries[i + 1], band_boundaries[i]);
const size_t first_freq_bin = band_boundaries[i];
const size_t last_freq_bin =
std::min(max_freq_bin_index, first_freq_bin + band_boundaries[i + 1] -
band_boundaries[i] - 1);
// Depending on the sample rate, the highest bands can have no FFT
// coefficients. Stop the iteration when coming across the first empty band.
if (first_freq_bin >= last_freq_bin)
break;
const size_t band_size = last_freq_bin - first_freq_bin + 1;
// Compute the band coefficient using a triangular band with peak response
// at the band boundary.
for (size_t j = first_freq_bin; j <= last_freq_bin; ++j) {
const float w = static_cast<float>(j - first_freq_bin) / band_size;
const float coefficient = functor(j);
coefficients[i] += (1.f - w) * coefficient;
coefficients[i + 1] += w * coefficient;
void SpectralCorrelator::ComputeCrossCorrelation(
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const {
constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
size_t k = 0; // Next Fourier coefficient index.
cross_corr[0] = 0.f;
for (size_t i = 0; i < kOpusBands24kHz - 1; ++i) {
cross_corr[i + 1] = 0.f;
for (int j = 0; j < kOpusScaleNumBins24kHz20ms[i]; ++j) { // Band size.
const float v = x[k].real() * y[k].real() + x[k].imag() * y[k].imag();
const float tmp = weights_[k] * v;
cross_corr[i] += v - tmp;
cross_corr[i + 1] += tmp;
k++;
}
}
// The first and the last bands in the loop above only got half contribution.
coefficients[0] *= 2.f;
coefficients[coefficients.size() - 1] *= 2.f;
// TODO(bugs.webrtc.org/9076): Replace the line above with
// "coefficients[i] *= 2.f" (*) since we now assume that the last band is
// always |kNumBands| - 1.
// (*): "size_t i" must be declared before the main loop.
cross_corr[0] *= 2.f; // The first band only gets half contribution.
// The Nyquist coefficient is never used.
RTC_DCHECK_EQ(k, kFftSizeBy2Plus1 - 1);
}
void ComputeBandEnergies(
rtc::ArrayView<const std::complex<float>> fft_coeffs,
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
rtc::ArrayView<float, kNumBands> band_energies) {
RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
auto functor = [fft_coeffs](const size_t freq_bin_index) -> float {
return std::norm(fft_coeffs[freq_bin_index]);
void ComputeSmoothedLogMagnitudeSpectrum(
rtc::ArrayView<const float> bands_energy,
rtc::ArrayView<float, kNumBands> log_bands_energy) {
RTC_DCHECK_LE(bands_energy.size(), kNumBands);
constexpr float kOneByHundred = 1e-2f;
constexpr float kLogOneByHundred = -2.f;
// Init.
float log_max = kLogOneByHundred;
float follow = kLogOneByHundred;
const auto smooth = [&log_max, &follow](float x) {
x = std::max(log_max - 7.f, std::max(follow - 1.5f, x));
log_max = std::max(log_max, x);
follow = std::max(follow - 1.5f, x);
return x;
};
ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1,
band_energies);
}
void ComputeLogBandEnergiesCoefficients(
rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
rtc::ArrayView<float, kNumBands> log_band_energy_coeffs) {
float log_max = -2.f;
float follow = -2.f;
for (size_t i = 0; i < band_energy_coeffs.size(); ++i) {
log_band_energy_coeffs[i] = std::log10(1e-2f + band_energy_coeffs[i]);
// Smoothing across frequency bands.
log_band_energy_coeffs[i] = std::max(
log_max - 7.f, std::max(follow - 1.5f, log_band_energy_coeffs[i]));
log_max = std::max(log_max, log_band_energy_coeffs[i]);
follow = std::max(follow - 1.5f, log_band_energy_coeffs[i]);
// Smoothing over the bands for which the band energy is defined.
for (size_t i = 0; i < bands_energy.size(); ++i) {
log_bands_energy[i] = smooth(std::log10(kOneByHundred + bands_energy[i]));
}
// Smoothing over the remaining bands (zero energy).
for (size_t i = bands_energy.size(); i < kNumBands; ++i) {
log_bands_energy[i] = smooth(kLogOneByHundred);
}
}
@ -113,17 +154,28 @@ std::array<float, kNumBands * kNumBands> ComputeDctTable() {
return dct_table;
}
void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out) {
// DCT scaling factor - i.e., sqrt(2 / kNumBands).
constexpr float kDctScalingFactor = 0.301511345f;
constexpr float kDctScalingFactorError =
kDctScalingFactor * kDctScalingFactor -
2.f / static_cast<float>(kNumBands);
static_assert(
(kDctScalingFactorError >= 0.f && kDctScalingFactorError < 1e-1f) ||
(kDctScalingFactorError < 0.f && kDctScalingFactorError > -1e-1f),
"kNumBands changed and kDctScalingFactor has not been updated.");
RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported.";
RTC_DCHECK_LE(in.size(), kNumBands);
RTC_DCHECK_LE(1, out.size());
RTC_DCHECK_LE(out.size(), in.size());
std::fill(out.begin(), out.end(), 0.f);
for (size_t i = 0; i < out.size(); ++i) {
out[i] = 0.f;
for (size_t j = 0; j < in.size(); ++j) {
out[i] += in[j] * dct_table[j * in.size() + i];
out[i] += in[j] * dct_table[j * kNumBands + i];
}
// TODO(bugs.webrtc.org/10480): Scaling factor in the DCT table.
out[i] *= kDctScalingFactor;
}
}

View File

@ -14,49 +14,75 @@
#include <stddef.h>
#include <array>
#include <complex>
#include <vector>
#include "api/array_view.h"
#include "api/function_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
namespace webrtc {
namespace rnn_vad {
// Computes FFT boundary indexes corresponding to sub-bands.
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
size_t sample_rate_hz,
size_t frame_size_samples);
// At a sample rate of 24 kHz, the last 3 Opus bands are beyond the Nyquist
// frequency. However, band #19 gets the contributions from band #18 because
// of the symmetric triangular filter with peak response at 12 kHz.
constexpr size_t kOpusBands24kHz = 20;
static_assert(kOpusBands24kHz < kNumBands,
"The number of bands at 24 kHz must be less than those defined "
"in the Opus scale at 48 kHz.");
// Iterates through frequency bands and computes coefficients via |functor| for
// triangular bands with peak response at each band boundary. |functor| returns
// a floating point value for the FFT coefficient having index equal to the
// argument passed to |functor|; that argument is in the range {0, ...
// |max_freq_bin_index| - 1}.
void ComputeBandCoefficients(
rtc::FunctionView<float(size_t)> functor,
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
const size_t max_freq_bin_index,
rtc::ArrayView<float, kNumBands> coefficients);
// Number of FFT frequency bins covered by each band in the Opus scale at a
// sample rate of 24 kHz for 20 ms frames.
// Declared here for unit testing.
constexpr std::array<int, kOpusBands24kHz - 1> GetOpusScaleNumBins24kHz20ms() {
// One entry per interval between adjacent band boundaries, hence
// |kOpusBands24kHz| - 1 entries. The entries sum to 240.
// NOTE(review): 240 presumably equals kFftSizeBy2Plus1 - 1, i.e., every FFT
// bin except the Nyquist one - confirm against kFrameSize20ms24kHz.
return {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 24, 24, 32, 48};
}
// Given an array of FFT coefficients and a vector of band boundary indexes,
// computes band energy coefficients.
void ComputeBandEnergies(
rtc::ArrayView<const std::complex<float>> fft_coeffs,
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
rtc::ArrayView<float, kNumBands> band_energies);
// TODO(bugs.webrtc.org/10480): Move to a separate file.
// Class to compute band-wise spectral features in the Opus perceptual scale
// for 20 ms frames sampled at 24 kHz. The analysis methods apply triangular
// filters with peak response at each band boundary.
class SpectralCorrelator {
public:
// Ctor.
SpectralCorrelator();
SpectralCorrelator(const SpectralCorrelator&) = delete;
SpectralCorrelator& operator=(const SpectralCorrelator&) = delete;
~SpectralCorrelator();
// Computes log band energy coefficients.
void ComputeLogBandEnergiesCoefficients(
rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
rtc::ArrayView<float, kNumBands> log_band_energy_coeffs);
// Computes the band-wise spectral auto-correlations.
void ComputeAutoCorrelation(
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const;
// Creates a DCT table for arrays having size equal to |kNumBands|.
// Computes the band-wise spectral cross-correlations.
void ComputeCrossCorrelation(
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const;
private:
const std::vector<float> weights_; // Weights for each Fourier coefficient.
};
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc.
// Given a vector of Opus-bands energy coefficients (|bands_energy|, a
// dynamically sized view), computes the log magnitude spectrum applying
// smoothing both over time and over frequency. |log_bands_energy| is the
// mutable view of |kNumBands| values.
// Declared here for unit testing.
void ComputeSmoothedLogMagnitudeSpectrum(
rtc::ArrayView<const float> bands_energy,
rtc::ArrayView<float, kNumBands> log_bands_energy);
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc.
// Creates a DCT table for arrays having size equal to |kNumBands|. The table
// is returned by value as a flat array of |kNumBands| * |kNumBands| floats.
// Declared here for unit testing.
std::array<float, kNumBands * kNumBands> ComputeDctTable();
// Computes DCT for |in| given a pre-computed DCT table. In-place computation is
// not allowed and |out| can be smaller than |in| in order to only compute the
// first DCT coefficients.
void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
// spectral_features.cc.
// Computes DCT for |in| given a pre-computed DCT table (|dct_table|, a view of
// |kNumBands| * |kNumBands| values). In-place computation is not allowed and
// |out| can be smaller than |in| in order to only compute the first DCT
// coefficients.
// Declared here for unit testing.
void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out);

View File

@ -10,6 +10,13 @@
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include <algorithm>
#include <array>
#include <complex>
#include <numeric>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
// #include "test/fpe_observer.h"
@ -20,58 +27,76 @@ namespace rnn_vad {
namespace test {
namespace {
constexpr size_t kSampleRate48kHz = 48000;
constexpr size_t kFrameSize20ms48kHz = 2 * kSampleRate48kHz / 100;
constexpr size_t kFftNumCoeffs20ms48kHz = kFrameSize20ms48kHz / 2 + 1;
// Generates the values for the array named |kOpusBandWeights24kHz20ms| in the
// anonymous namespace of the .cc file, which is the array of FFT coefficient
// weights for the Opus scale triangular filters.
std::vector<float> ComputeTriangularFiltersWeights() {
constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
const auto& v = kOpusScaleNumBins24kHz20ms; // Alias.
const size_t num_weights = std::accumulate(
kOpusScaleNumBins24kHz20ms.begin(), kOpusScaleNumBins24kHz20ms.end(), 0);
std::vector<float> weights(num_weights);
size_t next_fft_coeff_index = 0;
for (size_t band = 0; band < v.size(); ++band) {
const size_t band_size = v[band];
for (size_t j = 0; j < band_size; ++j) {
weights[next_fft_coeff_index + j] = static_cast<float>(j) / band_size;
}
next_fft_coeff_index += band_size;
}
return weights;
}
} // namespace
// TODO(bugs.webrtc.org/9076): Remove this test before closing the issue.
// Check that when using precomputed FFT coefficients for frames at 48 kHz, the
// output of ComputeBandEnergies() is bit exact.
TEST(RnnVadTest, ComputeBandEnergies48kHzBitExactness) {
// Initialize input data reader and buffers.
auto fft_coeffs_reader = CreateFftCoeffsReader();
const size_t num_frames = fft_coeffs_reader.second;
ASSERT_EQ(
kFftNumCoeffs20ms48kHz,
rtc::CheckedDivExact(fft_coeffs_reader.first->data_length(), num_frames) /
2);
std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_real;
std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_imag;
std::array<std::complex<float>, kFftNumCoeffs20ms48kHz> fft_coeffs;
// Init expected output reader and buffer.
auto band_energies_reader = CreateBandEnergyCoeffsReader();
ASSERT_EQ(num_frames, band_energies_reader.second);
std::array<float, kNumBands> expected_band_energies;
// Init band energies coefficients computation.
const auto band_boundary_indexes =
ComputeBandBoundaryIndexes(kSampleRate48kHz, kFrameSize20ms48kHz);
std::array<float, kNumBands> computed_band_energies;
// Check output for every frame.
{
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
// FloatingPointExceptionObserver fpe_observer;
for (size_t i = 0; i < num_frames; ++i) {
SCOPED_TRACE(i);
// Read input.
fft_coeffs_reader.first->ReadChunk(fft_coeffs_real);
fft_coeffs_reader.first->ReadChunk(fft_coeffs_imag);
for (size_t i = 0; i < kFftNumCoeffs20ms48kHz; ++i) {
fft_coeffs[i].real(fft_coeffs_real[i]);
fft_coeffs[i].imag(fft_coeffs_imag[i]);
}
band_energies_reader.first->ReadChunk(expected_band_energies);
// Compute band energy coefficients and check output.
ComputeBandEnergies(fft_coeffs, band_boundary_indexes,
computed_band_energies);
ExpectEqualFloatArray(expected_band_energies, computed_band_energies);
}
// Verifies that the per-band bin counts returned by
// GetOpusScaleNumBins24kHz20ms() agree with the Opus scale frequency
// boundaries expressed in Hz.
TEST(RnnVadTest, TestOpusScaleBoundaries) {
  constexpr int kBandFrequencyBoundariesHz[kNumBands - 1] = {
      200,  400,  600,  800,  1000, 1200, 1400,  1600,  2000,  2400, 2800,
      3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
  constexpr auto kNumBinsPerBand = GetOpusScaleNumBins24kHz20ms();
  int previous_boundary_bin = 0;
  size_t band = 0;
  for (const int num_bins : kNumBinsPerBand) {
    // Map the band boundary from Hz to a frequency bin index.
    const int boundary_bin = kBandFrequencyBoundariesHz[band] *
                             kFrameSize20ms24kHz / kSampleRate24kHz;
    EXPECT_EQ(num_bins, boundary_bin - previous_boundary_bin);
    previous_boundary_bin = boundary_bin;
    ++band;
  }
}
TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) {
// Verifies that, within each Opus band, the triangular filter weights computed
// by ComputeTriangularFiltersWeights() are strictly increasing. Only enable
// this test when ComputeTriangularFiltersWeights() is changed and
// |kOpusBandWeights24kHz20ms| is updated accordingly.
TEST(RnnVadTest, DISABLED_TestOpusScaleWeights) {
  auto weights = ComputeTriangularFiltersWeights();
  size_t band_offset = 0;
  for (size_t band_size : GetOpusScaleNumBins24kHz20ms()) {
    SCOPED_TRACE(band_size);
    // Start below any valid weight so that the first comparison passes.
    float previous_weight = -1.f;
    for (size_t k = 0; k < band_size; ++k) {
      const float weight = weights[band_offset + k];
      EXPECT_LT(previous_weight, weight);
      previous_weight = weight;
    }
    band_offset += band_size;
  }
}
// Checks that the band-wise auto-correlation of a spectrum whose coefficients
// all equal 1 + 1i is strictly positive in every band.
TEST(RnnVadTest, SpectralCorrelatorValidOutput) {
  SpectralCorrelator correlator;
  std::array<std::complex<float>, kFftSizeBy2Plus1> spectrum;
  std::array<float, kOpusBands24kHz> auto_corr;
  spectrum.fill({1.f, 1.f});
  correlator.ComputeAutoCorrelation(spectrum, auto_corr);
  size_t band = 0;
  for (const float band_auto_corr : auto_corr) {
    SCOPED_TRACE(band);
    EXPECT_GT(band_auto_corr, 0.f);
    ++band;
  }
}
TEST(RnnVadTest, ComputeSmoothedLogMagnitudeSpectrumWithinTolerance) {
constexpr std::array<float, kNumBands> input = {
{86.060539245605f, 275.668334960938f, 43.406528472900f, 6.541896820068f,
17.964015960693f, 8.090919494629f, 1.261920094490f, 1.212702631950f,
@ -90,7 +115,7 @@ TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) {
{
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
// FloatingPointExceptionObserver fpe_observer;
ComputeLogBandEnergiesCoefficients(input, computed_output);
ComputeSmoothedLogMagnitudeSpectrum(input, computed_output);
ExpectNearAbsolute(expected_output, computed_output, 1e-5f);
}
}

View File

@ -32,15 +32,35 @@ void WriteTestData(rtc::ArrayView<float> samples) {
}
}
SpectralFeaturesView GetSpectralFeaturesView(
rtc::ArrayView<float, kNumBands - kNumLowerBands> GetHigherBandsSpectrum(
std::array<float, kTestFeatureVectorSize>* feature_vector) {
return {
{feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands},
{feature_vector->data(), kNumLowerBands},
{feature_vector->data() + kNumBands, kNumLowerBands},
{feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands},
{feature_vector->data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
&(*feature_vector)[kNumBands + 3 * kNumLowerBands]};
return {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands};
}
// Returns a view over the first |kNumLowerBands| entries of |feature_vector|.
rtc::ArrayView<float, kNumLowerBands> GetAverage(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const first = feature_vector->data();
  return {first, kNumLowerBands};
}
// Returns a view over the |kNumLowerBands| entries of |feature_vector| that
// start at offset |kNumBands|.
rtc::ArrayView<float, kNumLowerBands> GetFirstDerivative(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const first = feature_vector->data() + kNumBands;
  return {first, kNumLowerBands};
}
// Returns a view over the |kNumLowerBands| entries of |feature_vector| that
// start at offset |kNumBands| + |kNumLowerBands|.
rtc::ArrayView<float, kNumLowerBands> GetSecondDerivative(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const first = feature_vector->data() + kNumBands + kNumLowerBands;
  return {first, kNumLowerBands};
}
// Returns a view over the |kNumLowerBands| entries of |feature_vector| that
// start at offset |kNumBands| + 2 * |kNumLowerBands|.
rtc::ArrayView<float, kNumLowerBands> GetCepstralCrossCorrelation(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const first =
      feature_vector->data() + kNumBands + 2 * kNumLowerBands;
  return {first, kNumLowerBands};
}
// Returns a pointer to the entry of |feature_vector| located at offset
// |kNumBands| + 3 * |kNumLowerBands|.
float* GetCepstralVariability(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const variability =
      feature_vector->data() + kNumBands + 3 * kNumLowerBands;
  return variability;
}
constexpr float kInitialFeatureVal = -9999.f;
@ -54,7 +74,6 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
rtc::ArrayView<float, kFrameSize20ms24kHz> samples_view(samples);
bool is_silence;
std::array<float, kTestFeatureVectorSize> feature_vector;
auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
// Write an initial value in the feature vector to detect changes.
std::fill(feature_vector.begin(), feature_vector.end(), kInitialFeatureVal);
@ -64,8 +83,12 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
// With silence.
std::fill(samples.begin(), samples.end(), 0.f);
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
feature_vector_view);
is_silence = sfe.CheckSilenceComputeFeatures(
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
GetSecondDerivative(&feature_vector),
GetCepstralCrossCorrelation(&feature_vector),
GetCepstralVariability(&feature_vector));
// Silence is expected, the output won't be overwritten.
EXPECT_TRUE(is_silence);
EXPECT_TRUE(std::all_of(feature_vector.begin(), feature_vector.end(),
@ -73,18 +96,22 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
// With no silence.
WriteTestData(samples);
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
feature_vector_view);
is_silence = sfe.CheckSilenceComputeFeatures(
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
GetSecondDerivative(&feature_vector),
GetCepstralCrossCorrelation(&feature_vector),
GetCepstralVariability(&feature_vector));
// Silence is not expected, the output will be overwritten.
EXPECT_FALSE(is_silence);
EXPECT_FALSE(std::all_of(feature_vector.begin(), feature_vector.end(),
[](float x) { return x == kInitialFeatureVal; }));
}
// When the input signal does not change, the spectral coefficients average does
// not change and the derivatives are zero. Similarly, the spectral variability
// When the input signal does not change, the cepstral coefficients average does
// not change and the derivatives are zero. Similarly, the cepstral variability
// score does not change either.
TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
TEST(RnnVadTest, CepstralFeaturesConstantAverageZeroDerivative) {
// Initialize.
SpectralFeaturesExtractor sfe;
std::array<float, kFrameSize20ms24kHz> samples;
@ -94,17 +121,24 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
// Fill the spectral features with test data.
std::array<float, kTestFeatureVectorSize> feature_vector;
auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
for (size_t i = 0; i < kSpectralCoeffsHistorySize; ++i) {
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
feature_vector_view);
for (size_t i = 0; i < kCepstralCoeffsHistorySize; ++i) {
is_silence = sfe.CheckSilenceComputeFeatures(
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
GetSecondDerivative(&feature_vector),
GetCepstralCrossCorrelation(&feature_vector),
GetCepstralVariability(&feature_vector));
}
// Feed the test data one last time but using a different output vector.
std::array<float, kTestFeatureVectorSize> feature_vector_last;
auto feature_vector_last_view = GetSpectralFeaturesView(&feature_vector_last);
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
feature_vector_last_view);
is_silence = sfe.CheckSilenceComputeFeatures(
samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector_last),
GetAverage(&feature_vector_last),
GetFirstDerivative(&feature_vector_last),
GetSecondDerivative(&feature_vector_last),
GetCepstralCrossCorrelation(&feature_vector_last),
GetCepstralVariability(&feature_vector_last));
// Average is unchanged.
ExpectEqualFloatArray({feature_vector.data(), kNumLowerBands},
@ -116,7 +150,7 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
ExpectEqualFloatArray(
{feature_vector_last.data() + kNumBands + kNumLowerBands, kNumLowerBands},
zeros);
// Spectral variability is unchanged.
// Variability is unchanged.
EXPECT_FLOAT_EQ(feature_vector[kNumBands + 3 * kNumLowerBands],
feature_vector_last[kNumBands + 3 * kNumLowerBands]);
}

View File

@ -87,14 +87,6 @@ ReaderPairType CreateFftCoeffsReader() {
return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), row_size)};
}
// Creates a reader for the "band_energies.dat" resource, read in chunks of 22
// floats (one per band). The second element of the returned pair is the data
// length divided by the number of bands.
ReaderPairType CreateBandEnergyCoeffsReader() {
  constexpr size_t num_bands = 22;
  auto reader = absl::make_unique<BinaryFileReader<float>>(
      test::ResourcePath("audio_processing/agc2/rnn_vad/band_energies", "dat"),
      num_bands);
  // Query the length before handing the reader over to the returned pair.
  const size_t num_rows =
      rtc::CheckedDivExact(reader->data_length(), num_bands);
  return {std::move(reader), num_rows};
}
ReaderPairType CreateSilenceFlagsFeatureMatrixReader() {
constexpr size_t feature_vector_size = 42;
auto ptr = absl::make_unique<BinaryFileReader<float>>(

View File

@ -110,9 +110,6 @@ CreateLpResidualAndPitchPeriodGainReader();
// Creates a reader for the FFT coefficients.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateFftCoeffsReader();
// Creates a reader for the band energy coefficients. Returns a pair holding
// the reader and a size value.
// NOTE(review): the size looks like the number of frames (data length divided
// by the number of bands) - confirm against the definition.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateBandEnergyCoeffsReader();
// Creates a reader for the silence flags and the feature matrix.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateSilenceFlagsFeatureMatrixReader();