RNN VAD optimizations: VectorMath::DotProduct() NEON arm64

Results: RNN VAD realtime factor improved from 140x to 195x (+55x)
Test device: Pixel 2 XL
Benchmark setup: max clock speed forced on all the cores by
setting "performance" as scaling governor

Bug: webrtc:10480
Change-Id: I3e92f643853ad1fe990db909c578ce78ee826c03
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/198842
Reviewed-by: Per Åhgren <peah@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32888}
This commit is contained in:
Alessio Bazzica
2020-12-28 17:57:24 +01:00
committed by Commit Bot
parent 5073cba99a
commit ed9f5f85fd
5 changed files with 62 additions and 1 deletions

View File

@ -162,6 +162,13 @@ rtc_library("rnn_vad_with_level") {
"vad_with_level.cc",
"vad_with_level.h",
]
defines = []
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
deps = [
":common",
":cpu_features",

View File

@ -17,6 +17,7 @@ rtc_library("rnn_vad") {
"rnn.h",
]
defines = []
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
@ -84,6 +85,13 @@ rtc_source_set("rnn_vad_layers") {
"rnn_gru.cc",
"rnn_gru.h",
]
defines = []
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
deps = [
":rnn_vad_common",
":vector_math",
@ -138,6 +146,13 @@ rtc_library("rnn_vad_pitch") {
"pitch_search_internal.cc",
"pitch_search_internal.h",
]
defines = []
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
deps = [
":rnn_vad_auto_correlation",
":rnn_vad_common",
@ -253,6 +268,13 @@ if (rtc_include_tests) {
"symmetric_matrix_buffer_unittest.cc",
"vector_math_unittest.cc",
]
defines = []
if (rtc_build_with_neon && current_cpu != "arm64") {
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
deps = [
":rnn_vad",
":rnn_vad_auto_correlation",

View File

@ -166,6 +166,9 @@ std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
if (available.sse2) {
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
}
if (available.neon) {
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true});
}
return v;
}

View File

@ -14,6 +14,9 @@
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"
#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
@ -70,8 +73,31 @@ class VectorMath {
}
return dot_product;
}
#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
if (cpu_features_.neon) {
float32x4_t accumulator = vdupq_n_f32(0.f);
constexpr int kBlockSizeLog2 = 2;
constexpr int kBlockSize = 1 << kBlockSizeLog2;
const int incomplete_block_index = (x.size() >> kBlockSizeLog2)
<< kBlockSizeLog2;
for (int i = 0; i < incomplete_block_index; i += kBlockSize) {
RTC_DCHECK_LE(i + kBlockSize, x.size());
const float32x4_t x_i = vld1q_f32(&x[i]);
const float32x4_t y_i = vld1q_f32(&y[i]);
accumulator = vfmaq_f32(accumulator, x_i, y_i);
}
// Reduce `accumulator` by addition.
const float32x2_t tmp =
vpadd_f32(vget_low_f32(accumulator), vget_high_f32(accumulator));
float dot_product = vget_lane_f32(vpadd_f32(tmp, vrev64_f32(tmp)), 0);
// Add the result for the last block if incomplete.
for (int i = incomplete_block_index;
i < rtc::dchecked_cast<int>(x.size()); ++i) {
dot_product += x[i] * y[i];
}
return dot_product;
}
#endif
// TODO(bugs.webrtc.org/10480): Add NEON alternative implementation.
return std::inner_product(x.begin(), x.end(), y.begin(), 0.f);
}

View File

@ -52,6 +52,9 @@ std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
if (available.sse2) {
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
}
if (available.neon) {
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true});
}
return v;
}