RNN VAD optimizations: VectorMath::DotProduct() NEON arm64
Results: RNN VAD realtime factor improved from 140x to 195x (+55x) Test device: Pixel 2 XL Benchmark setup: max clock speed forced on all the cores by setting "performance" as scaling governor Bug: webrtc:10480 Change-Id: I3e92f643853ad1fe990db909c578ce78ee826c03 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/198842 Reviewed-by: Per Åhgren <peah@webrtc.org> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/master@{#32888}
This commit is contained in:
committed by
Commit Bot
parent
5073cba99a
commit
ed9f5f85fd
@ -162,6 +162,13 @@ rtc_library("rnn_vad_with_level") {
|
||||
"vad_with_level.cc",
|
||||
"vad_with_level.h",
|
||||
]
|
||||
|
||||
defines = []
|
||||
if (rtc_build_with_neon && current_cpu != "arm64") {
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
}
|
||||
|
||||
deps = [
|
||||
":common",
|
||||
":cpu_features",
|
||||
|
||||
@ -17,6 +17,7 @@ rtc_library("rnn_vad") {
|
||||
"rnn.h",
|
||||
]
|
||||
|
||||
defines = []
|
||||
if (rtc_build_with_neon && current_cpu != "arm64") {
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
@ -84,6 +85,13 @@ rtc_source_set("rnn_vad_layers") {
|
||||
"rnn_gru.cc",
|
||||
"rnn_gru.h",
|
||||
]
|
||||
|
||||
defines = []
|
||||
if (rtc_build_with_neon && current_cpu != "arm64") {
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
}
|
||||
|
||||
deps = [
|
||||
":rnn_vad_common",
|
||||
":vector_math",
|
||||
@ -138,6 +146,13 @@ rtc_library("rnn_vad_pitch") {
|
||||
"pitch_search_internal.cc",
|
||||
"pitch_search_internal.h",
|
||||
]
|
||||
|
||||
defines = []
|
||||
if (rtc_build_with_neon && current_cpu != "arm64") {
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
}
|
||||
|
||||
deps = [
|
||||
":rnn_vad_auto_correlation",
|
||||
":rnn_vad_common",
|
||||
@ -253,6 +268,13 @@ if (rtc_include_tests) {
|
||||
"symmetric_matrix_buffer_unittest.cc",
|
||||
"vector_math_unittest.cc",
|
||||
]
|
||||
|
||||
defines = []
|
||||
if (rtc_build_with_neon && current_cpu != "arm64") {
|
||||
suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
}
|
||||
|
||||
deps = [
|
||||
":rnn_vad",
|
||||
":rnn_vad_auto_correlation",
|
||||
|
||||
@ -166,6 +166,9 @@ std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
if (available.sse2) {
|
||||
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
}
|
||||
if (available.neon) {
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true});
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
@ -14,6 +14,9 @@
|
||||
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#if defined(WEBRTC_HAS_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
@ -70,8 +73,31 @@ class VectorMath {
|
||||
}
|
||||
return dot_product;
|
||||
}
|
||||
#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
|
||||
if (cpu_features_.neon) {
|
||||
float32x4_t accumulator = vdupq_n_f32(0.f);
|
||||
constexpr int kBlockSizeLog2 = 2;
|
||||
constexpr int kBlockSize = 1 << kBlockSizeLog2;
|
||||
const int incomplete_block_index = (x.size() >> kBlockSizeLog2)
|
||||
<< kBlockSizeLog2;
|
||||
for (int i = 0; i < incomplete_block_index; i += kBlockSize) {
|
||||
RTC_DCHECK_LE(i + kBlockSize, x.size());
|
||||
const float32x4_t x_i = vld1q_f32(&x[i]);
|
||||
const float32x4_t y_i = vld1q_f32(&y[i]);
|
||||
accumulator = vfmaq_f32(accumulator, x_i, y_i);
|
||||
}
|
||||
// Reduce `accumulator` by addition.
|
||||
const float32x2_t tmp =
|
||||
vpadd_f32(vget_low_f32(accumulator), vget_high_f32(accumulator));
|
||||
float dot_product = vget_lane_f32(vpadd_f32(tmp, vrev64_f32(tmp)), 0);
|
||||
// Add the result for the last block if incomplete.
|
||||
for (int i = incomplete_block_index;
|
||||
i < rtc::dchecked_cast<int>(x.size()); ++i) {
|
||||
dot_product += x[i] * y[i];
|
||||
}
|
||||
return dot_product;
|
||||
}
|
||||
#endif
|
||||
// TODO(bugs.webrtc.org/10480): Add NEON alternative implementation.
|
||||
return std::inner_product(x.begin(), x.end(), y.begin(), 0.f);
|
||||
}
|
||||
|
||||
|
||||
@ -52,6 +52,9 @@ std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
if (available.sse2) {
|
||||
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
}
|
||||
if (available.neon) {
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true});
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user