RNN VAD: FC layer isolated into rnn_fc.h/.cc
Refactoring done to more easily and cleanly add SIMD optimizations and to remove `FullyConnectedLayer` from the RNN VAD API. Minor improvements (readability, API): - `FullyConnectedLayer` now takes an `ActivationFunction` enum value instead of a function view - SSE2 optimization moved into `FullyConnectedLayer::ComputeOutputSse2` - layer name added for improved logs Bug: webrtc:10480 Change-Id: Ida4903a67655e19ef0464f378c433c1f6e96dca7 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195444 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Sam Zackrisson <saza@webrtc.org> Cr-Commit-Position: refs/heads/master@{#32766}
This commit is contained in:
committed by
Commit Bot
parent
a760bca072
commit
31d3b217d3
@ -24,6 +24,7 @@ rtc_library("rnn_vad") {
|
||||
|
||||
deps = [
|
||||
":rnn_vad_common",
|
||||
":rnn_vad_layers",
|
||||
":rnn_vad_lp_residual",
|
||||
":rnn_vad_pitch",
|
||||
":rnn_vad_sequence_buffer",
|
||||
@ -78,6 +79,24 @@ rtc_library("rnn_vad_lp_residual") {
|
||||
]
|
||||
}
|
||||
|
||||
rtc_source_set("rnn_vad_layers") {
|
||||
sources = [
|
||||
"rnn_fc.cc",
|
||||
"rnn_fc.h",
|
||||
]
|
||||
deps = [
|
||||
":rnn_vad_common",
|
||||
"..:cpu_features",
|
||||
"../../../../api:array_view",
|
||||
"../../../../api:function_view",
|
||||
"../../../../rtc_base:checks",
|
||||
"../../../../rtc_base:safe_conversions",
|
||||
"../../../../rtc_base/system:arch",
|
||||
"//third_party/rnnoise:rnn_vad",
|
||||
]
|
||||
absl_deps = [ "//third_party/abseil-cpp/absl/strings" ]
|
||||
}
|
||||
|
||||
rtc_source_set("vector_math") {
|
||||
sources = [ "vector_math.h" ]
|
||||
deps = [
|
||||
@ -221,6 +240,7 @@ if (rtc_include_tests) {
|
||||
"pitch_search_internal_unittest.cc",
|
||||
"pitch_search_unittest.cc",
|
||||
"ring_buffer_unittest.cc",
|
||||
"rnn_fc_unittest.cc",
|
||||
"rnn_unittest.cc",
|
||||
"rnn_vad_unittest.cc",
|
||||
"sequence_buffer_unittest.cc",
|
||||
@ -233,6 +253,7 @@ if (rtc_include_tests) {
|
||||
":rnn_vad",
|
||||
":rnn_vad_auto_correlation",
|
||||
":rnn_vad_common",
|
||||
":rnn_vad_layers",
|
||||
":rnn_vad_lp_residual",
|
||||
":rnn_vad_pitch",
|
||||
":rnn_vad_ring_buffer",
|
||||
|
||||
@ -60,37 +60,6 @@ inline float RectifiedLinearUnit(float x) {
|
||||
return x < 0.f ? 0.f : x;
|
||||
}
|
||||
|
||||
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
|
||||
std::vector<float> scaled_params(params.size());
|
||||
std::transform(params.begin(), params.end(), scaled_params.begin(),
|
||||
[](int8_t x) -> float {
|
||||
return rnnoise::kWeightsScale * static_cast<float>(x);
|
||||
});
|
||||
return scaled_params;
|
||||
}
|
||||
|
||||
// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
|
||||
// function to improve setup time.
|
||||
// Casts and scales |weights| and re-arranges the layout.
|
||||
std::vector<float> GetPreprocessedFcWeights(
|
||||
rtc::ArrayView<const int8_t> weights,
|
||||
int output_size) {
|
||||
if (output_size == 1) {
|
||||
return GetScaledParams(weights);
|
||||
}
|
||||
// Transpose, scale and cast.
|
||||
const int input_size = rtc::CheckedDivExact(
|
||||
rtc::dchecked_cast<int>(weights.size()), output_size);
|
||||
std::vector<float> w(weights.size());
|
||||
for (int o = 0; o < output_size; ++o) {
|
||||
for (int i = 0; i < input_size; ++i) {
|
||||
w[o * input_size + i] = rnnoise::kWeightsScale *
|
||||
static_cast<float>(weights[i * output_size + o]);
|
||||
}
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
||||
constexpr int kNumGruGates = 3; // Update, reset, output.
|
||||
|
||||
// TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
|
||||
@ -202,106 +171,8 @@ void ComputeGruLayerOutput(int input_size,
|
||||
}
|
||||
}
|
||||
|
||||
// Fully connected layer un-optimized implementation.
|
||||
void ComputeFullyConnectedLayerOutput(
|
||||
int input_size,
|
||||
int output_size,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::FunctionView<float(float)> activation_function,
|
||||
rtc::ArrayView<float> output) {
|
||||
RTC_DCHECK_EQ(input.size(), input_size);
|
||||
RTC_DCHECK_EQ(bias.size(), output_size);
|
||||
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
|
||||
for (int o = 0; o < output_size; ++o) {
|
||||
output[o] = bias[o];
|
||||
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
||||
// |weights_| change the performance across different platforms.
|
||||
for (int i = 0; i < input_size; ++i) {
|
||||
output[o] += input[i] * weights[o * input_size + i];
|
||||
}
|
||||
output[o] = activation_function(output[o]);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
// Fully connected layer SSE2 implementation.
|
||||
void ComputeFullyConnectedLayerOutputSse2(
|
||||
int input_size,
|
||||
int output_size,
|
||||
rtc::ArrayView<const float> input,
|
||||
rtc::ArrayView<const float> bias,
|
||||
rtc::ArrayView<const float> weights,
|
||||
rtc::FunctionView<float(float)> activation_function,
|
||||
rtc::ArrayView<float> output) {
|
||||
RTC_DCHECK_EQ(input.size(), input_size);
|
||||
RTC_DCHECK_EQ(bias.size(), output_size);
|
||||
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
|
||||
const int input_size_by_4 = input_size >> 2;
|
||||
const int offset = input_size & ~3;
|
||||
__m128 sum_wx_128;
|
||||
const float* v = reinterpret_cast<const float*>(&sum_wx_128);
|
||||
for (int o = 0; o < output_size; ++o) {
|
||||
// Perform 128 bit vector operations.
|
||||
sum_wx_128 = _mm_set1_ps(0);
|
||||
const float* x_p = input.data();
|
||||
const float* w_p = weights.data() + o * input_size;
|
||||
for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
|
||||
sum_wx_128 = _mm_add_ps(sum_wx_128,
|
||||
_mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
|
||||
}
|
||||
// Perform non-vector operations for any remaining items, sum up bias term
|
||||
// and results from the vectorized code, and apply the activation function.
|
||||
output[o] = activation_function(
|
||||
std::inner_product(input.begin() + offset, input.end(),
|
||||
weights.begin() + o * input_size + offset,
|
||||
bias[o] + v[0] + v[1] + v[2] + v[3]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
FullyConnectedLayer::FullyConnectedLayer(
|
||||
const int input_size,
|
||||
const int output_size,
|
||||
const rtc::ArrayView<const int8_t> bias,
|
||||
const rtc::ArrayView<const int8_t> weights,
|
||||
rtc::FunctionView<float(float)> activation_function,
|
||||
const AvailableCpuFeatures& cpu_features)
|
||||
: input_size_(input_size),
|
||||
output_size_(output_size),
|
||||
bias_(GetScaledParams(bias)),
|
||||
weights_(GetPreprocessedFcWeights(weights, output_size)),
|
||||
activation_function_(activation_function),
|
||||
cpu_features_(cpu_features) {
|
||||
RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
|
||||
<< "Static over-allocation of fully-connected layers output vectors is "
|
||||
"not sufficient.";
|
||||
RTC_DCHECK_EQ(output_size_, bias_.size())
|
||||
<< "Mismatching output size and bias terms array size.";
|
||||
RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
|
||||
<< "Mismatching input-output size and weight coefficients array size.";
|
||||
}
|
||||
|
||||
FullyConnectedLayer::~FullyConnectedLayer() = default;
|
||||
|
||||
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
// TODO(bugs.chromium.org/10480): Add AVX2.
|
||||
if (cpu_features_.sse2) {
|
||||
ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
|
||||
bias_, weights_, activation_function_,
|
||||
output_);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// TODO(bugs.chromium.org/10480): Add Neon.
|
||||
ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
|
||||
weights_, activation_function_, output_);
|
||||
}
|
||||
|
||||
GatedRecurrentLayer::GatedRecurrentLayer(
|
||||
const int input_size,
|
||||
const int output_size,
|
||||
@ -346,8 +217,9 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features)
|
||||
kInputLayerOutputSize,
|
||||
kInputDenseBias,
|
||||
kInputDenseWeights,
|
||||
TansigApproximated,
|
||||
cpu_features),
|
||||
ActivationFunction::kTansigApproximated,
|
||||
cpu_features,
|
||||
/*layer_name=*/"FC1"),
|
||||
hidden_(kInputLayerOutputSize,
|
||||
kHiddenLayerOutputSize,
|
||||
kHiddenGruBias,
|
||||
@ -357,8 +229,9 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features)
|
||||
kOutputLayerOutputSize,
|
||||
kOutputDenseBias,
|
||||
kOutputDenseWeights,
|
||||
SigmoidApproximated,
|
||||
cpu_features) {
|
||||
ActivationFunction::kSigmoidApproximated,
|
||||
cpu_features,
|
||||
/*layer_name=*/"FC2") {
|
||||
// Input-output chaining size checks.
|
||||
RTC_DCHECK_EQ(input_.size(), hidden_.input_size())
|
||||
<< "The input and the hidden layers sizes do not match.";
|
||||
|
||||
@ -21,54 +21,15 @@
|
||||
#include "api/function_view.h"
|
||||
#include "modules/audio_processing/agc2/cpu_features.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
|
||||
// Maximum number of units for an FC layer.
|
||||
constexpr int kFullyConnectedLayerMaxUnits = 24;
|
||||
|
||||
// Maximum number of units for a GRU layer.
|
||||
constexpr int kGruLayerMaxUnits = 24;
|
||||
|
||||
// Fully-connected layer with a custom activation function which owns the output
|
||||
// buffer.
|
||||
class FullyConnectedLayer {
|
||||
public:
|
||||
// Ctor. `output_size` cannot be greater than `kFullyConnectedLayerMaxUnits`.
|
||||
FullyConnectedLayer(int input_size,
|
||||
int output_size,
|
||||
rtc::ArrayView<const int8_t> bias,
|
||||
rtc::ArrayView<const int8_t> weights,
|
||||
rtc::FunctionView<float(float)> activation_function,
|
||||
const AvailableCpuFeatures& cpu_features);
|
||||
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
|
||||
FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
|
||||
~FullyConnectedLayer();
|
||||
|
||||
// Returns the size of the input vector.
|
||||
int input_size() const { return input_size_; }
|
||||
// Returns the pointer to the first element of the output buffer.
|
||||
const float* data() const { return output_.data(); }
|
||||
// Returns the size of the output buffer.
|
||||
int size() const { return output_size_; }
|
||||
|
||||
// Computes the fully-connected layer output.
|
||||
void ComputeOutput(rtc::ArrayView<const float> input);
|
||||
|
||||
private:
|
||||
const int input_size_;
|
||||
const int output_size_;
|
||||
const std::vector<float> bias_;
|
||||
const std::vector<float> weights_;
|
||||
rtc::FunctionView<float(float)> activation_function_;
|
||||
// The output vector of a recurrent layer has length equal to |output_size_|.
|
||||
// However, for efficiency, over-allocation is used.
|
||||
std::array<float, kFullyConnectedLayerMaxUnits> output_;
|
||||
const AvailableCpuFeatures cpu_features_;
|
||||
};
|
||||
|
||||
// Recurrent layer with gated recurrent units (GRUs) with sigmoid and ReLU as
|
||||
// activation functions for the update/reset and output gates respectively. It
|
||||
// owns the output buffer.
|
||||
|
||||
151
modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
Normal file
151
modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
Normal file
@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
|
||||
#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/numerics/safe_conversions.h"
|
||||
#include "third_party/rnnoise/src/rnn_activations.h"
|
||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
namespace {
|
||||
|
||||
// Converts the quantized |params| to floating point by multiplying every entry
// by `::rnnoise::kWeightsScale`. The returned vector has the same length and
// ordering as |params|.
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled;
  scaled.reserve(params.size());
  for (const int8_t quantized : params) {
    scaled.push_back(::rnnoise::kWeightsScale * static_cast<float>(quantized));
  }
  return scaled;
}
|
||||
|
||||
// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
|
||||
// function to improve setup time.
|
||||
// Casts and scales |weights| and re-arranges the layout.
|
||||
std::vector<float> PreprocessWeights(rtc::ArrayView<const int8_t> weights,
|
||||
int output_size) {
|
||||
if (output_size == 1) {
|
||||
return GetScaledParams(weights);
|
||||
}
|
||||
// Transpose, scale and cast.
|
||||
const int input_size = rtc::CheckedDivExact(
|
||||
rtc::dchecked_cast<int>(weights.size()), output_size);
|
||||
std::vector<float> w(weights.size());
|
||||
for (int o = 0; o < output_size; ++o) {
|
||||
for (int i = 0; i < input_size; ++i) {
|
||||
w[o * input_size + i] = rnnoise::kWeightsScale *
|
||||
static_cast<float>(weights[i * output_size + o]);
|
||||
}
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
||||
// Maps |activation_function| to the matching rnnoise activation function.
rtc::FunctionView<float(float)> GetActivationFunction(
    ActivationFunction activation_function) {
  switch (activation_function) {
    case ActivationFunction::kTansigApproximated:
      return ::rnnoise::TansigApproximated;
    case ActivationFunction::kSigmoidApproximated:
      return ::rnnoise::SigmoidApproximated;
  }
  // The switch has no `default` label on purpose, so that compilers can still
  // report enumerators that are not handled. Control only gets here when
  // |activation_function| holds a value that is not an enumerator; fail loudly
  // instead of flowing off the end of a non-void function, which is undefined
  // behavior.
  RTC_CHECK(false) << "Unsupported activation function.";
  // Not expected to execute; present so that every path returns a value.
  return ::rnnoise::TansigApproximated;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Builds the layer from the quantized |bias| and |weights|: both are converted
// to floating point once here (the weights are also re-arranged by
// `PreprocessWeights()`), and |activation_function| is resolved to the
// function that `ComputeOutput()` applies. |layer_name| is only used in the
// messages of the checks below.
FullyConnectedLayer::FullyConnectedLayer(
    const int input_size,
    const int output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    ActivationFunction activation_function,
    const AvailableCpuFeatures& cpu_features,
    absl::string_view layer_name)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(PreprocessWeights(weights, output_size)),
      cpu_features_(cpu_features),
      activation_function_(GetActivationFunction(activation_function)) {
  // `output_` has a fixed capacity of `kFullyConnectedLayerMaxUnits` elements.
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
      << "Insufficient FC layer over-allocation (" << layer_name << ").";
  // One bias term per output unit.
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size (" << layer_name
      << ").";
  // One weight per (input, output) pair.
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size ("
      << layer_name << ").";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;
|
||||
|
||||
// Computes the layer output for |input|, which must hold `input_size_` values,
// and writes it into the first `output_size_` elements of `output_`. On x86
// builds the SSE2 implementation is used when `cpu_features_.sse2` is set.
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  RTC_DCHECK_EQ(input.size(), input_size_);
#if defined(WEBRTC_ARCH_X86_FAMILY)
  // TODO(bugs.chromium.org/10480): Add AVX2.
  if (cpu_features_.sse2) {
    ComputeOutputSse2(input);
    return;
  }
#endif
  // TODO(bugs.chromium.org/10480): Add Neon.

  // Un-optimized implementation.
  // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
  // |weights_| change the performance across different platforms.
  const float* unit_weights = weights_.data();
  for (int unit = 0; unit < output_size_;
       ++unit, unit_weights += input_size_) {
    // Bias plus the dot product between |input| and the weights of this unit,
    // accumulated in input order.
    float weighted_sum = bias_[unit];
    for (int k = 0; k < input_size_; ++k) {
      weighted_sum += input[k] * unit_weights[k];
    }
    output_[unit] = activation_function_(weighted_sum);
  }
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
// SSE2 implementation of the layer output computation. For every output unit,
// the dot product between |input| and the weights of that unit is accumulated
// four floats at a time; the elements left over when `input_size_` is not a
// multiple of 4 are handled with scalar code. |input| is expected to hold
// `input_size_` values (checked by `ComputeOutput()` before dispatching here).
void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView<const float> input) {
  // Number of 4-float groups processed with vector instructions and index of
  // the first input element left to the scalar tail.
  const int input_size_by_4 = input_size_ >> 2;
  const int offset = input_size_ & ~3;
  for (int o = 0; o < output_size_; ++o) {
    // The row stride is the layer input size, the same quantity the loop
    // bounds above are derived from.
    const float* const weights_row = weights_.data() + o * input_size_;
    // Perform 128 bit vector operations.
    __m128 sum_wx_128 = _mm_setzero_ps();
    const float* x_p = input.data();
    const float* w_p = weights_row;
    for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
      sum_wx_128 = _mm_add_ps(sum_wx_128,
                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
    }
    // Copy the four partial sums into a plain float array rather than reading
    // the vector register variable through a casted pointer.
    float v[4];
    _mm_storeu_ps(v, sum_wx_128);
    // Perform non-vector operations for any remaining items, sum up bias term
    // and results from the vectorized code, and apply the activation function.
    output_[o] = activation_function_(
        std::inner_product(input.begin() + offset, input.end(),
                           weights_row + offset,
                           bias_[o] + v[0] + v[1] + v[2] + v[3]));
  }
}
|
||||
#endif // defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
|
||||
} // namespace rnn_vad
|
||||
} // namespace webrtc
|
||||
76
modules/audio_processing/agc2/rnn_vad/rnn_fc.h
Normal file
76
modules/audio_processing/agc2/rnn_vad/rnn_fc.h
Normal file
@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
|
||||
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "api/array_view.h"
|
||||
#include "api/function_view.h"
|
||||
#include "modules/audio_processing/agc2/cpu_features.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
|
||||
// Activation function for a neural network cell.
|
||||
enum class ActivationFunction { kTansigApproximated, kSigmoidApproximated };
|
||||
|
||||
// Maximum number of units for an FC layer.
|
||||
constexpr int kFullyConnectedLayerMaxUnits = 24;
|
||||
|
||||
// Fully-connected layer with a custom activation function which owns the output
|
||||
// buffer.
|
||||
class FullyConnectedLayer {
|
||||
public:
|
||||
// Ctor. `output_size` cannot be greater than `kFullyConnectedLayerMaxUnits`.
|
||||
FullyConnectedLayer(int input_size,
|
||||
int output_size,
|
||||
rtc::ArrayView<const int8_t> bias,
|
||||
rtc::ArrayView<const int8_t> weights,
|
||||
ActivationFunction activation_function,
|
||||
const AvailableCpuFeatures& cpu_features,
|
||||
absl::string_view layer_name);
|
||||
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
|
||||
FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
|
||||
~FullyConnectedLayer();
|
||||
|
||||
// Returns the size of the input vector.
|
||||
int input_size() const { return input_size_; }
|
||||
// Returns the pointer to the first element of the output buffer.
|
||||
const float* data() const { return output_.data(); }
|
||||
// Returns the size of the output buffer.
|
||||
int size() const { return output_size_; }
|
||||
|
||||
// Computes the fully-connected layer output.
|
||||
void ComputeOutput(rtc::ArrayView<const float> input);
|
||||
|
||||
private:
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
void ComputeOutputSse2(rtc::ArrayView<const float> input);
|
||||
#endif
|
||||
|
||||
const int input_size_;
|
||||
const int output_size_;
|
||||
const std::vector<float> bias_;
|
||||
const std::vector<float> weights_;
|
||||
const AvailableCpuFeatures cpu_features_;
|
||||
rtc::FunctionView<float(float)> activation_function_;
|
||||
// Over-allocated array with size equal to `output_size_`.
|
||||
std::array<float, kFullyConnectedLayerMaxUnits> output_;
|
||||
};
|
||||
|
||||
} // namespace rnn_vad
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
|
||||
109
modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
Normal file
109
modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
Normal file
@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
#include "api/array_view.h"
|
||||
#include "modules/audio_processing/agc2/cpu_features.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
|
||||
#include "modules/audio_processing/test/performance_timer.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
#include "test/gtest.h"
|
||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
namespace test {
|
||||
namespace {
|
||||
|
||||
using ::rnnoise::kInputDenseBias;
|
||||
using ::rnnoise::kInputDenseWeights;
|
||||
using ::rnnoise::kInputLayerInputSize;
|
||||
using ::rnnoise::kInputLayerOutputSize;
|
||||
|
||||
// Fully connected layer test data.
|
||||
constexpr std::array<float, 42> kFullyConnectedInputVector = {
|
||||
-1.00131f, -0.627069f, -7.81097f, 7.86285f, -2.87145f, 3.32365f,
|
||||
-0.653161f, 0.529839f, -0.425307f, 0.25583f, 0.235094f, 0.230527f,
|
||||
-0.144687f, 0.182785f, 0.57102f, 0.125039f, 0.479482f, -0.0255439f,
|
||||
-0.0073141f, -0.147346f, -0.217106f, -0.0846906f, -8.34943f, 3.09065f,
|
||||
1.42628f, -0.85235f, -0.220207f, -0.811163f, 2.09032f, -2.01425f,
|
||||
-0.690268f, -0.925327f, -0.541354f, 0.58455f, -0.606726f, -0.0372358f,
|
||||
0.565991f, 0.435854f, 0.420812f, 0.162198f, -2.13f, 10.0089f};
|
||||
constexpr std::array<float, 24> kFullyConnectedExpectedOutput = {
|
||||
-0.623293f, -0.988299f, 0.999378f, 0.967168f, 0.103087f, -0.978545f,
|
||||
-0.856347f, 0.346675f, 1.f, -0.717442f, -0.544176f, 0.960363f,
|
||||
0.983443f, 0.999991f, -0.824335f, 0.984742f, 0.990208f, 0.938179f,
|
||||
0.875092f, 0.999846f, 0.997707f, -0.999382f, 0.973153f, -0.966605f};
|
||||
|
||||
class RnnParametrization
|
||||
: public ::testing::TestWithParam<AvailableCpuFeatures> {};
|
||||
|
||||
// Checks that the output of a fully connected layer is within tolerance given
|
||||
// test input data.
|
||||
TEST_P(RnnParametrization, CheckFullyConnectedLayerOutput) {
|
||||
FullyConnectedLayer fc(kInputLayerInputSize, kInputLayerOutputSize,
|
||||
kInputDenseBias, kInputDenseWeights,
|
||||
ActivationFunction::kTansigApproximated,
|
||||
/*cpu_features=*/GetParam(),
|
||||
/*layer_name=*/"FC");
|
||||
fc.ComputeOutput(kFullyConnectedInputVector);
|
||||
ExpectNearAbsolute(kFullyConnectedExpectedOutput, fc, 1e-5f);
|
||||
}
|
||||
|
||||
// Benchmark (disabled by default): times repeated `ComputeOutput()` calls for
// the CPU features under test and logs the average duration and its standard
// deviation.
TEST_P(RnnParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
  const AvailableCpuFeatures cpu_features = GetParam();
  FullyConnectedLayer fc(kInputLayerInputSize, kInputLayerOutputSize,
                         kInputDenseBias, kInputDenseWeights,
                         ActivationFunction::kTansigApproximated, cpu_features,
                         /*layer_name=*/"FC");

  constexpr int kNumTests = 10000;
  ::webrtc::test::PerformanceTimer perf_timer(kNumTests);
  for (int run = 0; run < kNumTests; ++run) {
    perf_timer.StartTimer();
    fc.ComputeOutput(kFullyConnectedInputVector);
    perf_timer.StopTimer();
  }
  // Timer statistics are divided by 1000 before being logged as "ms".
  const auto average = perf_timer.GetDurationAverage() / 1000;
  const auto standard_deviation =
      perf_timer.GetDurationStandardDeviation() / 1000;
  RTC_LOG(LS_INFO) << "CPU features: " << cpu_features.ToString() << " | "
                   << average << " +/- " << standard_deviation << " ms";
}
|
||||
|
||||
// Finds the relevant CPU features combinations to test.
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
if (available.sse2) {
|
||||
AvailableCpuFeatures features(
|
||||
{/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(features);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
RnnVadTest,
|
||||
RnnParametrization,
|
||||
::testing::ValuesIn(GetCpuFeaturesToTest()),
|
||||
[](const ::testing::TestParamInfo<AvailableCpuFeatures>& info) {
|
||||
return info.param.ToString();
|
||||
});
|
||||
|
||||
} // namespace
|
||||
} // namespace test
|
||||
} // namespace rnn_vad
|
||||
} // namespace webrtc
|
||||
@ -20,9 +20,7 @@
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/numerics/safe_conversions.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
#include "test/gtest.h"
|
||||
#include "third_party/rnnoise/src/rnn_activations.h"
|
||||
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
||||
|
||||
namespace webrtc {
|
||||
@ -67,21 +65,6 @@ void TestGatedRecurrentLayer(
|
||||
}
|
||||
}
|
||||
|
||||
// Fully connected layer test data.
|
||||
constexpr std::array<float, 42> kFullyConnectedInputVector = {
|
||||
-1.00131f, -0.627069f, -7.81097f, 7.86285f, -2.87145f, 3.32365f,
|
||||
-0.653161f, 0.529839f, -0.425307f, 0.25583f, 0.235094f, 0.230527f,
|
||||
-0.144687f, 0.182785f, 0.57102f, 0.125039f, 0.479482f, -0.0255439f,
|
||||
-0.0073141f, -0.147346f, -0.217106f, -0.0846906f, -8.34943f, 3.09065f,
|
||||
1.42628f, -0.85235f, -0.220207f, -0.811163f, 2.09032f, -2.01425f,
|
||||
-0.690268f, -0.925327f, -0.541354f, 0.58455f, -0.606726f, -0.0372358f,
|
||||
0.565991f, 0.435854f, 0.420812f, 0.162198f, -2.13f, 10.0089f};
|
||||
constexpr std::array<float, 24> kFullyConnectedExpectedOutput = {
|
||||
-0.623293f, -0.988299f, 0.999378f, 0.967168f, 0.103087f, -0.978545f,
|
||||
-0.856347f, 0.346675f, 1.f, -0.717442f, -0.544176f, 0.960363f,
|
||||
0.983443f, 0.999991f, -0.824335f, 0.984742f, 0.990208f, 0.938179f,
|
||||
0.875092f, 0.999846f, 0.997707f, -0.999382f, 0.973153f, -0.966605f};
|
||||
|
||||
// Gated recurrent units layer test data.
|
||||
constexpr int kGruInputSize = 5;
|
||||
constexpr int kGruOutputSize = 4;
|
||||
@ -170,61 +153,6 @@ TEST(RnnVadTest, DISABLED_BenchmarkGatedRecurrentLayer) {
|
||||
<< " ms";
|
||||
}
|
||||
|
||||
class RnnParametrization
|
||||
: public ::testing::TestWithParam<AvailableCpuFeatures> {};
|
||||
|
||||
// Checks that the output of a fully connected layer is within tolerance given
|
||||
// test input data.
|
||||
TEST_P(RnnParametrization, CheckFullyConnectedLayerOutput) {
|
||||
FullyConnectedLayer fc(
|
||||
rnnoise::kInputLayerInputSize, rnnoise::kInputLayerOutputSize,
|
||||
rnnoise::kInputDenseBias, rnnoise::kInputDenseWeights,
|
||||
rnnoise::TansigApproximated, /*cpu_features=*/GetParam());
|
||||
fc.ComputeOutput(kFullyConnectedInputVector);
|
||||
ExpectNearAbsolute(kFullyConnectedExpectedOutput, fc, 1e-5f);
|
||||
}
|
||||
|
||||
TEST_P(RnnParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
|
||||
const AvailableCpuFeatures cpu_features = GetParam();
|
||||
FullyConnectedLayer fc(rnnoise::kInputLayerInputSize,
|
||||
rnnoise::kInputLayerOutputSize,
|
||||
rnnoise::kInputDenseBias, rnnoise::kInputDenseWeights,
|
||||
rnnoise::TansigApproximated, cpu_features);
|
||||
|
||||
constexpr int kNumTests = 10000;
|
||||
::webrtc::test::PerformanceTimer perf_timer(kNumTests);
|
||||
for (int k = 0; k < kNumTests; ++k) {
|
||||
perf_timer.StartTimer();
|
||||
fc.ComputeOutput(kFullyConnectedInputVector);
|
||||
perf_timer.StopTimer();
|
||||
}
|
||||
RTC_LOG(LS_INFO) << "CPU features: " << cpu_features.ToString() << " | "
|
||||
<< (perf_timer.GetDurationAverage() / 1000) << " +/- "
|
||||
<< (perf_timer.GetDurationStandardDeviation() / 1000)
|
||||
<< " ms";
|
||||
}
|
||||
|
||||
// Finds the relevant CPU features combinations to test.
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
if (available.sse2) {
|
||||
AvailableCpuFeatures features(
|
||||
{/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(features);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
RnnVadTest,
|
||||
RnnParametrization,
|
||||
::testing::ValuesIn(GetCpuFeaturesToTest()),
|
||||
[](const ::testing::TestParamInfo<AvailableCpuFeatures>& info) {
|
||||
return info.param.ToString();
|
||||
});
|
||||
|
||||
// Checks that the speech probability is zero with silence.
|
||||
TEST(RnnVadTest, CheckZeroProbabilityWithSilence) {
|
||||
RnnVad rnn_vad(GetAvailableCpuFeatures());
|
||||
|
||||
Reference in New Issue
Block a user