Revert "Reland "AGC2 RNN VAD: Recurrent Neural Network impl""
This reverts commit e0bba68edea74ca33f4c492eba290c089f233f6b. Reason for revert: <INSERT REASONING HERE> Original change's description: > Reland "AGC2 RNN VAD: Recurrent Neural Network impl" > > This reverts commit 97e349ace7a3fd64fff270f0d780e02bb708f503. > > Reason for revert: downstream projects fixed > > Original change's description: > > Revert "AGC2 RNN VAD: Recurrent Neural Network impl" > > > > This reverts commit 2491cb73820fe82923b848dfcab6772b4b0addb0. > > > > Reason for revert: broke internal build > > > > Original change's description: > > > AGC2 RNN VAD: Recurrent Neural Network impl > > > > > > RNN implementation for the AGC2 VAD that includes a fully connected > > > layer and a gated recurrent unit layer. > > > > > > Bug: webrtc:9076 > > > Change-Id: Ibb8b0b4e9213f09eb9dbe118bbdc94d7e8e4f91b > > > Reviewed-on: https://webrtc-review.googlesource.com/72060 > > > Reviewed-by: Patrik Höglund <phoglund@webrtc.org> > > > Reviewed-by: Alex Loiko <aleloi@webrtc.org> > > > Reviewed-by: Ivo Creusen <ivoc@webrtc.org> > > > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> > > > Cr-Commit-Position: refs/heads/master@{#23101} > > > > TBR=phoglund@webrtc.org,alessiob@webrtc.org,aleloi@webrtc.org,ivoc@webrtc.org > > > > Change-Id: Ic311c4b7d79094e959d3a2c4a53c398f34c954e2 > > No-Presubmit: true > > No-Tree-Checks: true > > No-Try: true > > Bug: webrtc:9076 > > Reviewed-on: https://webrtc-review.googlesource.com/74200 > > Reviewed-by: Sam Zackrisson <saza@webrtc.org> > > Commit-Queue: Sam Zackrisson <saza@webrtc.org> > > Cr-Commit-Position: refs/heads/master@{#23103} > > TBR=phoglund@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,aleloi@webrtc.org,ivoc@webrtc.org > > Change-Id: I0c7f8e0f59be926322d05b1da1d4d19c0777dab2 > No-Presubmit: true > No-Tree-Checks: true > No-Try: true > Bug: webrtc:9076 > Reviewed-on: https://webrtc-review.googlesource.com/74460 > Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> > 
Cr-Commit-Position: refs/heads/master@{#23113} TBR=phoglund@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,aleloi@webrtc.org,ivoc@webrtc.org Change-Id: I3985a6d38df1d4438a50d031bc9f6cf41eb83121 No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: webrtc:9076 Reviewed-on: https://webrtc-review.googlesource.com/74560 Reviewed-by: Sam Zackrisson <saza@webrtc.org> Commit-Queue: Sam Zackrisson <saza@webrtc.org> Cr-Commit-Position: refs/heads/master@{#23117}
This commit is contained in:

committed by
Commit Bot

parent
bf51de83b8
commit
3c9f47434f
@ -25,15 +25,12 @@ source_set("lib") {
|
|||||||
"pitch_search_internal.cc",
|
"pitch_search_internal.cc",
|
||||||
"pitch_search_internal.h",
|
"pitch_search_internal.h",
|
||||||
"ring_buffer.h",
|
"ring_buffer.h",
|
||||||
"rnn.cc",
|
|
||||||
"rnn.h",
|
|
||||||
"sequence_buffer.h",
|
"sequence_buffer.h",
|
||||||
"symmetric_matrix_buffer.h",
|
"symmetric_matrix_buffer.h",
|
||||||
]
|
]
|
||||||
deps = [
|
deps = [
|
||||||
"../../../../api:array_view",
|
"../../../../api:array_view",
|
||||||
"../../../../rtc_base:checks",
|
"../../../../rtc_base:checks",
|
||||||
"//third_party/rnnoise:rnn_vad",
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -56,8 +53,6 @@ if (rtc_include_tests) {
|
|||||||
unittest_resources = [
|
unittest_resources = [
|
||||||
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_buf_24k.dat",
|
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_buf_24k.dat",
|
||||||
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_lp_res.dat",
|
"../../../../resources/audio_processing/agc2/rnn_vad/pitch_lp_res.dat",
|
||||||
"../../../../resources/audio_processing/agc2/rnn_vad/sil_features.dat",
|
|
||||||
"../../../../resources/audio_processing/agc2/rnn_vad/vad_prob.dat",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
if (is_ios) {
|
if (is_ios) {
|
||||||
@ -77,7 +72,6 @@ if (rtc_include_tests) {
|
|||||||
"pitch_search_internal_unittest.cc",
|
"pitch_search_internal_unittest.cc",
|
||||||
"pitch_search_unittest.cc",
|
"pitch_search_unittest.cc",
|
||||||
"ring_buffer_unittest.cc",
|
"ring_buffer_unittest.cc",
|
||||||
"rnn_unittest.cc",
|
|
||||||
"sequence_buffer_unittest.cc",
|
"sequence_buffer_unittest.cc",
|
||||||
"symmetric_matrix_buffer_unittest.cc",
|
"symmetric_matrix_buffer_unittest.cc",
|
||||||
]
|
]
|
||||||
@ -85,9 +79,7 @@ if (rtc_include_tests) {
|
|||||||
":lib",
|
":lib",
|
||||||
":lib_test",
|
":lib_test",
|
||||||
"../../../../api:array_view",
|
"../../../../api:array_view",
|
||||||
"../../../../rtc_base:checks",
|
|
||||||
"../../../../test:test_support",
|
"../../../../test:test_support",
|
||||||
"//third_party/rnnoise:rnn_vad",
|
|
||||||
]
|
]
|
||||||
data = unittest_resources
|
data = unittest_resources
|
||||||
if (is_ios) {
|
if (is_ios) {
|
||||||
|
@ -1,3 +0,0 @@
|
|||||||
include_rules = [
|
|
||||||
"+third_party/rnnoise",
|
|
||||||
]
|
|
@ -43,8 +43,6 @@ constexpr size_t kMaxPitch12kHz = kMaxPitch24kHz / 2;
|
|||||||
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
|
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
|
||||||
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
|
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
|
||||||
|
|
||||||
constexpr size_t kFeatureVectorSize = 42;
|
|
||||||
|
|
||||||
} // namespace rnn_vad
|
} // namespace rnn_vad
|
||||||
} // namespace webrtc
|
} // namespace webrtc
|
||||||
|
|
||||||
|
@ -1,227 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <array>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include "rtc_base/checks.h"
|
|
||||||
#include "third_party/rnnoise/src/rnn_activations.h"
|
|
||||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
namespace rnn_vad {
|
|
||||||
|
|
||||||
using rnnoise::kWeightsScale;
|
|
||||||
|
|
||||||
using rnnoise::kInputLayerInputSize;
|
|
||||||
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
|
|
||||||
using rnnoise::kInputDenseWeights;
|
|
||||||
using rnnoise::kInputDenseBias;
|
|
||||||
using rnnoise::kInputLayerOutputSize;
|
|
||||||
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
|
||||||
"Increase kFullyConnectedLayersMaxUnits.");
|
|
||||||
|
|
||||||
using rnnoise::kHiddenGruRecurrentWeights;
|
|
||||||
using rnnoise::kHiddenGruWeights;
|
|
||||||
using rnnoise::kHiddenGruBias;
|
|
||||||
using rnnoise::kHiddenLayerOutputSize;
|
|
||||||
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
|
|
||||||
"Increase kRecurrentLayersMaxUnits.");
|
|
||||||
|
|
||||||
using rnnoise::kOutputDenseWeights;
|
|
||||||
using rnnoise::kOutputDenseBias;
|
|
||||||
using rnnoise::kOutputLayerOutputSize;
|
|
||||||
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
|
||||||
"Increase kFullyConnectedLayersMaxUnits.");
|
|
||||||
|
|
||||||
using rnnoise::RectifiedLinearUnit;
|
|
||||||
using rnnoise::SigmoidApproximated;
|
|
||||||
using rnnoise::TansigApproximated;
|
|
||||||
|
|
||||||
FullyConnectedLayer::FullyConnectedLayer(
|
|
||||||
const size_t input_size,
|
|
||||||
const size_t output_size,
|
|
||||||
const rtc::ArrayView<const int8_t> bias,
|
|
||||||
const rtc::ArrayView<const int8_t> weights,
|
|
||||||
float (*const activation_function)(float))
|
|
||||||
: input_size_(input_size),
|
|
||||||
output_size_(output_size),
|
|
||||||
bias_(bias),
|
|
||||||
weights_(weights),
|
|
||||||
activation_function_(activation_function) {
|
|
||||||
RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
|
|
||||||
<< "Static over-allocation of fully-connected layers output vectors is "
|
|
||||||
"not sufficient.";
|
|
||||||
RTC_DCHECK_EQ(output_size_, bias_.size())
|
|
||||||
<< "Mismatching output size and bias terms array size.";
|
|
||||||
RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
|
|
||||||
<< "Mismatching input-output size and weight coefficients array size.";
|
|
||||||
}
|
|
||||||
|
|
||||||
FullyConnectedLayer::~FullyConnectedLayer() = default;
|
|
||||||
|
|
||||||
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
|
|
||||||
return rtc::ArrayView<const float>(output_.data(), output_size_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
|
|
||||||
// TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
|
|
||||||
// operations.
|
|
||||||
for (size_t o = 0; o < output_size_; ++o) {
|
|
||||||
output_[o] = bias_[o];
|
|
||||||
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
|
||||||
// |weights_| change the performance across different platforms.
|
|
||||||
for (size_t i = 0; i < input_size_; ++i) {
|
|
||||||
output_[o] += input[i] * weights_[i * output_size_ + o];
|
|
||||||
}
|
|
||||||
output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
GatedRecurrentLayer::GatedRecurrentLayer(
|
|
||||||
const size_t input_size,
|
|
||||||
const size_t output_size,
|
|
||||||
const rtc::ArrayView<const int8_t> bias,
|
|
||||||
const rtc::ArrayView<const int8_t> weights,
|
|
||||||
const rtc::ArrayView<const int8_t> recurrent_weights,
|
|
||||||
float (*const activation_function)(float))
|
|
||||||
: input_size_(input_size),
|
|
||||||
output_size_(output_size),
|
|
||||||
bias_(bias),
|
|
||||||
weights_(weights),
|
|
||||||
recurrent_weights_(recurrent_weights),
|
|
||||||
activation_function_(activation_function) {
|
|
||||||
RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
|
|
||||||
<< "Static over-allocation of recurrent layers state vectors is not "
|
|
||||||
<< "sufficient.";
|
|
||||||
RTC_DCHECK_EQ(3 * output_size_, bias_.size())
|
|
||||||
<< "Mismatching output size and bias terms array size.";
|
|
||||||
RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
|
|
||||||
<< "Mismatching input-output size and weight coefficients array size.";
|
|
||||||
RTC_DCHECK_EQ(3 * input_size_ * output_size_, recurrent_weights_.size())
|
|
||||||
<< "Mismatching input-output size and recurrent weight coefficients array"
|
|
||||||
<< " size.";
|
|
||||||
Reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
|
|
||||||
|
|
||||||
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
|
|
||||||
return rtc::ArrayView<const float>(state_.data(), output_size_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void GatedRecurrentLayer::Reset() {
|
|
||||||
state_.fill(0.f);
|
|
||||||
}
|
|
||||||
|
|
||||||
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
|
|
||||||
// TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
|
|
||||||
// operations.
|
|
||||||
// Stride and offset used to read parameter arrays.
|
|
||||||
const size_t stride = 3 * output_size_;
|
|
||||||
size_t offset = 0;
|
|
||||||
|
|
||||||
// Compute update gates.
|
|
||||||
std::array<float, kRecurrentLayersMaxUnits> update;
|
|
||||||
for (size_t o = 0; o < output_size_; ++o) {
|
|
||||||
update[o] = bias_[o];
|
|
||||||
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
|
||||||
// |weights_| and |recurrent_weights_| change the performance across
|
|
||||||
// different platforms.
|
|
||||||
for (size_t i = 0; i < input_size_; ++i) { // Add input.
|
|
||||||
update[o] += input[i] * weights_[i * stride + o];
|
|
||||||
}
|
|
||||||
for (size_t s = 0; s < output_size_; ++s) {
|
|
||||||
update[o] += state_[s] * recurrent_weights_[s * stride + o];
|
|
||||||
} // Add state.
|
|
||||||
update[o] = SigmoidApproximated(kWeightsScale * update[o]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute reset gates.
|
|
||||||
offset += output_size_;
|
|
||||||
std::array<float, kRecurrentLayersMaxUnits> reset;
|
|
||||||
for (size_t o = 0; o < output_size_; ++o) {
|
|
||||||
reset[o] = bias_[offset + o];
|
|
||||||
for (size_t i = 0; i < input_size_; ++i) { // Add input.
|
|
||||||
reset[o] += input[i] * weights_[offset + i * stride + o];
|
|
||||||
}
|
|
||||||
for (size_t s = 0; s < output_size_; ++s) { // Add state.
|
|
||||||
reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
|
|
||||||
}
|
|
||||||
reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute output.
|
|
||||||
offset += output_size_;
|
|
||||||
std::array<float, kRecurrentLayersMaxUnits> output;
|
|
||||||
for (size_t o = 0; o < output_size_; ++o) {
|
|
||||||
output[o] = bias_[offset + o];
|
|
||||||
for (size_t i = 0; i < input_size_; ++i) { // Add input.
|
|
||||||
output[o] += input[i] * weights_[offset + i * stride + o];
|
|
||||||
}
|
|
||||||
for (size_t s = 0; s < output_size_;
|
|
||||||
++s) { // Add state through reset gates.
|
|
||||||
output[o] +=
|
|
||||||
state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
|
|
||||||
}
|
|
||||||
output[o] = (*activation_function_)(kWeightsScale * output[o]);
|
|
||||||
// Update output through the update gates.
|
|
||||||
output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the state. Not done in the previous loop since that would pollute
|
|
||||||
// the current state and lead to incorrect output values.
|
|
||||||
std::copy(output.begin(), output.end(), state_.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
RnnBasedVad::RnnBasedVad()
|
|
||||||
: input_layer_(kInputLayerInputSize,
|
|
||||||
kInputLayerOutputSize,
|
|
||||||
kInputDenseBias,
|
|
||||||
kInputDenseWeights,
|
|
||||||
TansigApproximated),
|
|
||||||
hidden_layer_(kInputLayerOutputSize,
|
|
||||||
kHiddenLayerOutputSize,
|
|
||||||
kHiddenGruBias,
|
|
||||||
kHiddenGruWeights,
|
|
||||||
kHiddenGruRecurrentWeights,
|
|
||||||
RectifiedLinearUnit),
|
|
||||||
output_layer_(kHiddenLayerOutputSize,
|
|
||||||
kOutputLayerOutputSize,
|
|
||||||
kOutputDenseBias,
|
|
||||||
kOutputDenseWeights,
|
|
||||||
SigmoidApproximated) {
|
|
||||||
// Input-output chaining size checks.
|
|
||||||
RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
|
|
||||||
<< "The input and the hidden layers sizes do not match.";
|
|
||||||
RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
|
|
||||||
<< "The hidden and the output layers sizes do not match.";
|
|
||||||
}
|
|
||||||
|
|
||||||
RnnBasedVad::~RnnBasedVad() = default;
|
|
||||||
|
|
||||||
void RnnBasedVad::Reset() {
|
|
||||||
hidden_layer_.Reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
void RnnBasedVad::ComputeVadProbability(
|
|
||||||
rtc::ArrayView<const float, kFeatureVectorSize> feature_vector) {
|
|
||||||
input_layer_.ComputeOutput(feature_vector);
|
|
||||||
hidden_layer_.ComputeOutput(input_layer_.GetOutput());
|
|
||||||
output_layer_.ComputeOutput(hidden_layer_.GetOutput());
|
|
||||||
const auto vad_output = output_layer_.GetOutput();
|
|
||||||
vad_probability_ = vad_output[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace rnn_vad
|
|
||||||
} // namespace webrtc
|
|
@ -1,116 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_
|
|
||||||
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_
|
|
||||||
|
|
||||||
#include <array>
|
|
||||||
|
|
||||||
#include "api/array_view.h"
|
|
||||||
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
namespace rnn_vad {
|
|
||||||
|
|
||||||
// Maximum number of units for a fully-connected layer. This value is used to
|
|
||||||
// over-allocate space for fully-connected layers output vectors (implemented as
|
|
||||||
// std::array). The value should equal the number of units of the largest
|
|
||||||
// fully-connected layer.
|
|
||||||
constexpr size_t kFullyConnectedLayersMaxUnits = 24;
|
|
||||||
|
|
||||||
// Maximum number of units for a recurrent layer. This value is used to
|
|
||||||
// over-allocate space for recurrent layers state vectors (implemented as
|
|
||||||
// std::array). The value should equal the number of units of the largest
|
|
||||||
// recurrent layer.
|
|
||||||
constexpr size_t kRecurrentLayersMaxUnits = 24;
|
|
||||||
|
|
||||||
// Fully-connected layer.
|
|
||||||
class FullyConnectedLayer {
|
|
||||||
public:
|
|
||||||
FullyConnectedLayer(const size_t input_size,
|
|
||||||
const size_t output_size,
|
|
||||||
const rtc::ArrayView<const int8_t> bias,
|
|
||||||
const rtc::ArrayView<const int8_t> weights,
|
|
||||||
float (*const activation_function)(float));
|
|
||||||
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
|
|
||||||
FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
|
|
||||||
~FullyConnectedLayer();
|
|
||||||
size_t input_size() const { return input_size_; }
|
|
||||||
size_t output_size() const { return output_size_; }
|
|
||||||
rtc::ArrayView<const float> GetOutput() const;
|
|
||||||
// Computes the fully-connected layer output.
|
|
||||||
void ComputeOutput(rtc::ArrayView<const float> input);
|
|
||||||
|
|
||||||
private:
|
|
||||||
const size_t input_size_;
|
|
||||||
const size_t output_size_;
|
|
||||||
const rtc::ArrayView<const int8_t> bias_;
|
|
||||||
const rtc::ArrayView<const int8_t> weights_;
|
|
||||||
float (*const activation_function_)(float);
|
|
||||||
// The output vector of a recurrent layer has length equal to |output_size_|.
|
|
||||||
// However, for efficiency, over-allocation is used.
|
|
||||||
std::array<float, kFullyConnectedLayersMaxUnits> output_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Recurrent layer with gated recurrent units (GRUs).
|
|
||||||
class GatedRecurrentLayer {
|
|
||||||
public:
|
|
||||||
GatedRecurrentLayer(const size_t input_size,
|
|
||||||
const size_t output_size,
|
|
||||||
const rtc::ArrayView<const int8_t> bias,
|
|
||||||
const rtc::ArrayView<const int8_t> weights,
|
|
||||||
const rtc::ArrayView<const int8_t> recurrent_weights,
|
|
||||||
float (*const activation_function)(float));
|
|
||||||
GatedRecurrentLayer(const GatedRecurrentLayer&) = delete;
|
|
||||||
GatedRecurrentLayer& operator=(const GatedRecurrentLayer&) = delete;
|
|
||||||
~GatedRecurrentLayer();
|
|
||||||
size_t input_size() const { return input_size_; }
|
|
||||||
size_t output_size() const { return output_size_; }
|
|
||||||
rtc::ArrayView<const float> GetOutput() const;
|
|
||||||
void Reset();
|
|
||||||
// Computes the recurrent layer output and updates the status.
|
|
||||||
void ComputeOutput(rtc::ArrayView<const float> input);
|
|
||||||
|
|
||||||
private:
|
|
||||||
const size_t input_size_;
|
|
||||||
const size_t output_size_;
|
|
||||||
const rtc::ArrayView<const int8_t> bias_;
|
|
||||||
const rtc::ArrayView<const int8_t> weights_;
|
|
||||||
const rtc::ArrayView<const int8_t> recurrent_weights_;
|
|
||||||
float (*const activation_function_)(float);
|
|
||||||
// The state vector of a recurrent layer has length equal to |output_size_|.
|
|
||||||
// However, to avoid dynamic allocation, over-allocation is used.
|
|
||||||
std::array<float, kRecurrentLayersMaxUnits> state_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Recurrent network based VAD.
|
|
||||||
class RnnBasedVad {
|
|
||||||
public:
|
|
||||||
RnnBasedVad();
|
|
||||||
RnnBasedVad(const RnnBasedVad&) = delete;
|
|
||||||
RnnBasedVad& operator=(const RnnBasedVad&) = delete;
|
|
||||||
~RnnBasedVad();
|
|
||||||
float vad_probability() const { return vad_probability_; }
|
|
||||||
void Reset();
|
|
||||||
// Compute and returns the probability of voice (range: [0.0, 1.0]).
|
|
||||||
void ComputeVadProbability(
|
|
||||||
rtc::ArrayView<const float, kFeatureVectorSize> feature_vector);
|
|
||||||
|
|
||||||
private:
|
|
||||||
FullyConnectedLayer input_layer_;
|
|
||||||
GatedRecurrentLayer hidden_layer_;
|
|
||||||
FullyConnectedLayer output_layer_;
|
|
||||||
float vad_probability_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace rnn_vad
|
|
||||||
} // namespace webrtc
|
|
||||||
|
|
||||||
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_H_
|
|
@ -1,180 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <array>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
|
|
||||||
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
|
|
||||||
#include "rtc_base/checks.h"
|
|
||||||
#include "test/gtest.h"
|
|
||||||
#include "third_party/rnnoise/src/rnn_activations.h"
|
|
||||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
namespace rnn_vad {
|
|
||||||
namespace test {
|
|
||||||
|
|
||||||
using rnnoise::RectifiedLinearUnit;
|
|
||||||
using rnnoise::SigmoidApproximated;
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
// Runs |fc| on |input_vector| and expects the first output item to match
// |expected_output| within an absolute tolerance of 3e-6.
void TestFullyConnectedLayer(FullyConnectedLayer* fc,
                             rtc::ArrayView<const float> input_vector,
                             const float expected_output) {
  constexpr float kTolerance = 3e-6f;
  RTC_CHECK(fc);
  fc->ComputeOutput(input_vector);
  const auto computed = fc->GetOutput();
  EXPECT_NEAR(expected_output, computed[0], kTolerance);
}
|
|
||||||
|
|
||||||
// Resets |gru|, feeds it |input_sequence| one input vector at a time and,
// after every step, compares the layer output with the matching slice of
// |expected_output_sequence| (absolute tolerance: 3e-6).
void TestGatedRecurrentLayer(
    GatedRecurrentLayer* gru,
    rtc::ArrayView<const float> input_sequence,
    rtc::ArrayView<const float> expected_output_sequence) {
  RTC_CHECK(gru);
  const size_t in_size = gru->input_size();
  const size_t out_size = gru->output_size();
  // Fetched once: the same view is compared after every step below.
  auto gru_output = gru->GetOutput();
  const size_t num_input_steps =
      rtc::CheckedDivExact(input_sequence.size(), in_size);
  const size_t num_output_steps =
      rtc::CheckedDivExact(expected_output_sequence.size(), out_size);
  ASSERT_EQ(num_input_steps, num_output_steps)
      << "The test data length is invalid.";
  // Feed the GRU layer and check the output at every step.
  gru->Reset();
  for (size_t step = 0; step < num_input_steps; ++step) {
    SCOPED_TRACE(step);
    gru->ComputeOutput(input_sequence.subview(step * in_size, in_size));
    const auto expected_step_output =
        expected_output_sequence.subview(step * out_size, out_size);
    ExpectNearAbsolute(expected_step_output, gru_output, 3e-6f);
  }
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
// Bit-exactness check for fully connected layers.
|
|
||||||
TEST(RnnVadTest, CheckFullyConnectedLayerOutput) {
|
|
||||||
const std::array<int8_t, 1> bias = {-50};
|
|
||||||
const std::array<int8_t, 24> weights = {
|
|
||||||
127, 127, 127, 127, 127, 20, 127, -126, -126, -54, 14, 125,
|
|
||||||
-126, -126, 127, -125, -126, 127, -127, -127, -57, -30, 127, 80};
|
|
||||||
FullyConnectedLayer fc(24, 1, bias, weights, SigmoidApproximated);
|
|
||||||
// Test on different inputs.
|
|
||||||
{
|
|
||||||
const std::array<float, 24> input_vector = {
|
|
||||||
0.f, 0.f, 0.f, 0.f, 0.f,
|
|
||||||
0.f, 0.215833917f, 0.290601075f, 0.238759011f, 0.244751841f,
|
|
||||||
0.f, 0.0461241305f, 0.106401242f, 0.223070428f, 0.630603909f,
|
|
||||||
0.690453172f, 0.f, 0.387645692f, 0.166913897f, 0.f,
|
|
||||||
0.0327451192f, 0.f, 0.136149868f, 0.446351469f};
|
|
||||||
TestFullyConnectedLayer(&fc, {input_vector}, 0.436567038f);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
const std::array<float, 24> input_vector = {
|
|
||||||
0.592162728f, 0.529089332f, 1.18205106f,
|
|
||||||
1.21736848f, 0.f, 0.470851123f,
|
|
||||||
0.130675942f, 0.320903003f, 0.305496395f,
|
|
||||||
0.0571633279f, 1.57001138f, 0.0182026215f,
|
|
||||||
0.0977443159f, 0.347477973f, 0.493206412f,
|
|
||||||
0.9688586f, 0.0320267938f, 0.244722098f,
|
|
||||||
0.312745273f, 0.f, 0.00650715502f,
|
|
||||||
0.312553257f, 1.62619662f, 0.782880902f};
|
|
||||||
TestFullyConnectedLayer(&fc, {input_vector}, 0.874741316f);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
const std::array<float, 24> input_vector = {
|
|
||||||
0.395022154f, 0.333681047f, 0.76302278f,
|
|
||||||
0.965480626f, 0.f, 0.941198349f,
|
|
||||||
0.0892967582f, 0.745046318f, 0.635769248f,
|
|
||||||
0.238564298f, 0.970656633f, 0.014159563f,
|
|
||||||
0.094203949f, 0.446816623f, 0.640755892f,
|
|
||||||
1.20532358f, 0.0254284926f, 0.283327013f,
|
|
||||||
0.726210058f, 0.0550272502f, 0.000344108557f,
|
|
||||||
0.369803518f, 1.56680179f, 0.997883797f};
|
|
||||||
TestFullyConnectedLayer(&fc, {input_vector}, 0.672785878f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Checks a GRU layer with 5 inputs and 4 units (ReLU activation) against a
// precomputed output sequence of 4 steps.
TEST(RnnVadTest, CheckGatedRecurrentLayer) {
  // Layer parameters.
  const std::array<int8_t, 12> bias = {96,   -99, -81, -114, 49,  119,
                                       -118, 68,  -76, 91,   121, 125};
  const std::array<int8_t, 60> weights = {
      124, 9,    1,    116, -66, -21, -118, -110, 104,  75,  -23,  -51,
      -72, -111, 47,   93,  77,  -98, 41,   -8,   40,   -23, -43,  -107,
      9,   -73,  30,   -32, -2,  64,  -26,  91,   -48,  -24, -28,  -104,
      74,  -46,  116,  15,  32,  52,  -126, -38,  -121, 12,  -16,  110,
      -95, 66,   -103, -35, -38, 3,   -126, -61,  28,   98,  -117, -43};
  const std::array<int8_t, 60> recurrent_weights = {
      -3,  87,  50,  51,  -22,  27,  -39, 62,   31,  -83, -52,  -48,
      -6,  83,  -19, 104, 105,  48,  23,  68,   23,  40,  7,    -120,
      64,  -62, 117, 85,  -51,  -43, 54,  -105, 120, 56,  -128, -107,
      39,  50,  -17, -47, -117, 14,  108, 12,   -7,  -72, 103,  -87,
      -66, 82,  84,  100, -98,  102, -49, 44,   122, 106, -20,  -69};
  GatedRecurrentLayer gru(5, 4, bias, weights, recurrent_weights,
                          RectifiedLinearUnit);

  // Four consecutive input vectors (5 values each).
  const std::array<float, 20> inputs = {
      0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f,
      0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f,
      0.65780413f, 0.41657975f, 0.49403164f, 0.46843281f, 0.75138855f,
      0.24517593f, 0.47657707f, 0.57064998f, 0.435184f,   0.19319285f};
  // Expected layer output after each step (4 values each).
  const std::array<float, 16> expected_outputs = {
      0.0239123f,  0.5773077f,  0.f,         0.f,
      0.01282811f, 0.64330572f, 0.f,         0.04863098f,
      0.00781069f, 0.75267816f, 0.f,         0.02579715f,
      0.00471378f, 0.59162533f, 0.11087593f, 0.01334511f};
  TestGatedRecurrentLayer(&gru, inputs, expected_outputs);
}
|
|
||||||
|
|
||||||
// TODO(bugs.webrtc.org/9076): Remove when the issue is fixed.
|
|
||||||
// Bit-exactness test checking that precomputed frame-wise features lead to the
|
|
||||||
// expected VAD probabilities.
|
|
||||||
TEST(RnnVadTest, RnnBitExactness) {
|
|
||||||
// Init.
|
|
||||||
auto features_reader = CreateSilenceFlagsFeatureMatrixReader();
|
|
||||||
auto vad_probs_reader = CreateVadProbsReader();
|
|
||||||
ASSERT_EQ(features_reader.second, vad_probs_reader.second);
|
|
||||||
const size_t num_frames = features_reader.second;
|
|
||||||
// Frame-wise buffers.
|
|
||||||
float expected_vad_probability;
|
|
||||||
float is_silence;
|
|
||||||
std::array<float, kFeatureVectorSize> features;
|
|
||||||
|
|
||||||
// Compute VAD probability using the precomputed features.
|
|
||||||
RnnBasedVad vad;
|
|
||||||
for (size_t i = 0; i < num_frames; ++i) {
|
|
||||||
SCOPED_TRACE(i);
|
|
||||||
// Read frame data.
|
|
||||||
RTC_CHECK(vad_probs_reader.first->ReadValue(&expected_vad_probability));
|
|
||||||
// The features file also includes a silence flag for each frame.
|
|
||||||
RTC_CHECK(features_reader.first->ReadValue(&is_silence));
|
|
||||||
RTC_CHECK(
|
|
||||||
features_reader.first->ReadChunk({features.data(), features.size()}));
|
|
||||||
// Skip silent frames.
|
|
||||||
ASSERT_TRUE(is_silence == 0.f || is_silence == 1.f);
|
|
||||||
if (is_silence == 1.f) {
|
|
||||||
ASSERT_EQ(expected_vad_probability, 0.f);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Compute and check VAD probability.
|
|
||||||
vad.ComputeVadProbability({features.data(), features.size()});
|
|
||||||
EXPECT_NEAR(expected_vad_probability, vad.vad_probability(), 3e-6f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace test
|
|
||||||
} // namespace rnn_vad
|
|
||||||
} // namespace webrtc
|
|
@ -53,21 +53,6 @@ ReaderPairType CreateLpResidualAndPitchPeriodGainReader() {
|
|||||||
rtc::CheckedDivExact(ptr->data_length(), 2 + num_lp_residual_coeffs)};
|
rtc::CheckedDivExact(ptr->data_length(), 2 + num_lp_residual_coeffs)};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Opens the "sil_features" resource (reader chunk size: 42) and returns the
// reader paired with the number of frames in the file.
ReaderPairType CreateSilenceFlagsFeatureMatrixReader() {
  auto ptr = rtc::MakeUnique<BinaryFileReader<float>>(
      test::ResourcePath("audio_processing/agc2/rnn_vad/sil_features", "dat"),
      42);
  // Features (42) and silence flag.
  // Computed before |ptr| is moved so that the code does not depend on the
  // move being deferred until after |ptr| is dereferenced.
  const size_t num_frames =
      rtc::CheckedDivExact(ptr->data_length(), static_cast<size_t>(43));
  return {std::move(ptr), num_frames};
}
|
|
||||||
|
|
||||||
// Opens the "vad_prob" resource and returns the reader paired with its data
// length.
ReaderPairType CreateVadProbsReader() {
  auto ptr = rtc::MakeUnique<BinaryFileReader<float>>(
      test::ResourcePath("audio_processing/agc2/rnn_vad/vad_prob", "dat"));
  // Read the length before |ptr| is moved so that the code does not depend on
  // the move being deferred until after |ptr| is dereferenced.
  const size_t num_values = ptr->data_length();
  return {std::move(ptr), num_values};
}
|
|
||||||
|
|
||||||
} // namespace test
|
} // namespace test
|
||||||
} // namespace rnn_vad
|
} // namespace rnn_vad
|
||||||
} // namespace webrtc
|
} // namespace webrtc
|
||||||
|
@ -95,12 +95,6 @@ CreatePitchBuffer24kHzReader();
|
|||||||
// and gain values.
|
// and gain values.
|
||||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
||||||
CreateLpResidualAndPitchPeriodGainReader();
|
CreateLpResidualAndPitchPeriodGainReader();
|
||||||
// Instance a reader for the silence flags and the feature matrix.
// The second pair member is the number of frames, i.e., the reader data length
// divided by 43 (42 features plus one silence flag per frame).
|
|
||||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
|
||||||
CreateSilenceFlagsFeatureMatrixReader();
|
|
||||||
// Instance a reader for the VAD probabilities.
// The second pair member is the data length reported by the reader.
|
|
||||||
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
|
|
||||||
CreateVadProbsReader();
|
|
||||||
|
|
||||||
} // namespace test
|
} // namespace test
|
||||||
} // namespace rnn_vad
|
} // namespace rnn_vad
|
||||||
|
@ -1 +0,0 @@
|
|||||||
e0a92782c2903be9da10385d924d34e8bf212d5e
|
|
@ -1 +0,0 @@
|
|||||||
05735ede0b457318e307d12f5acfd11bbbbd0afd
|
|
@ -44,7 +44,6 @@ LIB_TO_LICENSES_DICT = {
|
|||||||
'openmax_dl': ['third_party/openmax_dl/LICENSE'],
|
'openmax_dl': ['third_party/openmax_dl/LICENSE'],
|
||||||
'opus': ['third_party/opus/src/COPYING'],
|
'opus': ['third_party/opus/src/COPYING'],
|
||||||
'protobuf': ['third_party/protobuf/LICENSE'],
|
'protobuf': ['third_party/protobuf/LICENSE'],
|
||||||
'rnnoise': ['third_party/rnnoise/COPYING'],
|
|
||||||
'usrsctp': ['third_party/usrsctp/LICENSE'],
|
'usrsctp': ['third_party/usrsctp/LICENSE'],
|
||||||
'webrtc': ['LICENSE', 'LICENSE_THIRD_PARTY'],
|
'webrtc': ['LICENSE', 'LICENSE_THIRD_PARTY'],
|
||||||
'zlib': ['third_party/zlib/LICENSE'],
|
'zlib': ['third_party/zlib/LICENSE'],
|
||||||
|
Reference in New Issue
Block a user