RNN VAD: SSE2 optimization for VectorMath::DotProduct

Bug: webrtc:10480
Change-Id: I9f40352308bbfd5ea72a2607e7d1184cb6b85333
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/194328
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32745}
This commit is contained in:
Alessio Bazzica
2020-11-26 14:29:46 +01:00
committed by Commit Bot
parent 04ee79c749
commit b6e840c036
2 changed files with 38 additions and 5 deletions

View File

@ -11,6 +11,13 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_VECTOR_MATH_H_
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_VECTOR_MATH_H_
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <numeric>
#include "api/array_view.h"
@ -31,14 +38,39 @@ class VectorMath {
// Computes the dot product between two equally sized vectors.
// On x86 builds the implementation is selected at run time from
// `cpu_features_`: AVX2 delegates to `DotProductAvx2()`, SSE2 is implemented
// inline below. In every other case `std::inner_product` is used.
float DotProduct(rtc::ArrayView<const float> x,
                 rtc::ArrayView<const float> y) const {
  // Checked once up front; every code path below relies on it.
  RTC_DCHECK_EQ(x.size(), y.size());
#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (cpu_features_.avx2) {
    return DotProductAvx2(x, y);
  } else if (cpu_features_.sse2) {
    __m128 accumulator = _mm_setzero_ps();
    // Each SSE2 register holds 4 floats.
    constexpr int kBlockSizeLog2 = 2;
    constexpr size_t kBlockSize = size_t{1} << kBlockSizeLog2;
    // `x.size()` rounded down to a multiple of `kBlockSize`, i.e. the index of
    // the first element that does not belong to a complete block.
    const size_t incomplete_block_index = (x.size() >> kBlockSizeLog2)
                                          << kBlockSizeLog2;
    for (size_t i = 0; i < incomplete_block_index; i += kBlockSize) {
      RTC_DCHECK_LE(i + kBlockSize, x.size());
      // Unaligned loads: no alignment is required from the input views.
      const __m128 x_i = _mm_loadu_ps(&x[i]);
      const __m128 y_i = _mm_loadu_ps(&y[i]);
      // Multiply-add.
      const __m128 z_j = _mm_mul_ps(x_i, y_i);
      accumulator = _mm_add_ps(accumulator, z_j);
    }
    // Reduce `accumulator` by addition: first add the upper two lanes onto the
    // lower two, then add lane 1 onto lane 0.
    __m128 high = _mm_movehl_ps(accumulator, accumulator);
    accumulator = _mm_add_ps(accumulator, high);
    high = _mm_shuffle_ps(accumulator, accumulator, 1);
    accumulator = _mm_add_ps(accumulator, high);
    float dot_product = _mm_cvtss_f32(accumulator);
    // Add the result for the last block if incomplete.
    for (size_t i = incomplete_block_index; i < x.size(); ++i) {
      dot_product += x[i] * y[i];
    }
    return dot_product;
  }
#endif
  // TODO(bugs.webrtc.org/10480): Add NEON alternative implementation.
  return std::inner_product(x.begin(), x.end(), y.begin(), 0.f);
}

View File

@ -47,9 +47,10 @@ std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
AvailableCpuFeatures available = GetAvailableCpuFeatures();
if (available.avx2) {
AvailableCpuFeatures features(
{/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
v.push_back(features);
v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
}
if (available.sse2) {
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
}
return v;
}