[enhancement] support simd instructions on arm cpus through sse2neon (#10068)

* [enhancement] support simd instructions on arm cpus through sse2neon
This commit is contained in:
Zhengguo Yang
2022-06-14 09:17:09 +08:00
committed by GitHub
parent 7cf0cc7dd6
commit 39a2785ce2
20 changed files with 105 additions and 3176 deletions

View File

@ -412,7 +412,7 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
if (WITH_MYSQL)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_MYSQL")

View File

@ -20,8 +20,8 @@
// and modified by Doris
#ifdef __aarch64__
#include "util/sse2neon.h"
#else //__aarch64__
#include <sse2neon.h>
#else
#include <emmintrin.h>
#include <mm_malloc.h>
#endif
@ -115,16 +115,10 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h
new_bucket[i] = 1U << new_bucket[i];
}
for (int i = 0; i < 2; ++i) {
#ifdef __aarch64__
uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&_directory[bucket_idx][4 * i]);
*existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
#else
__m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
__m128i* existing_bucket =
reinterpret_cast<__m128i*>(&DCHECK_NOTNULL(_directory)[bucket_idx][4 * i]);
*existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
#endif
}
}
@ -194,7 +188,7 @@ Status BlockBloomFilter::or_equal_array(size_t n, const uint8_t* __restrict__ in
void BlockBloomFilter::or_equal_array_no_avx2(size_t n, const uint8_t* __restrict__ in,
uint8_t* __restrict__ out) {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
// The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
// written in a way that is very friendly to auto-vectorization. Instead, we manually
// vectorize, increasing the speed by up to 56x.

View File

@ -26,7 +26,7 @@
#include "gutil/bits.h"
#include "util/cpu_info.h"
#ifdef __aarch64__
#include "sse2neon.h"
#include <sse2neon.h>
#else
#include <emmintrin.h>
#include <immintrin.h>

View File

@ -19,8 +19,10 @@
// https://github.com/facebook/rocksdb/blob/master/util/crc32c.cc
#include "util/crc32c.h"
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
#include <nmmintrin.h>
#elif defined(__aarch64__)
#include <sse2neon.h>
#endif
#include "util/coding.h"
@ -204,9 +206,8 @@ static inline uint64_t LE_LOAD64(const uint8_t* p) {
}
static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
#ifndef __SSE4_2__
Slow_CRC32(l, p);
#elif defined(__LP64__) || defined(_WIN64)
#if defined(__SSE4_2__) || defined(__aarch64__)
#if (defined(__LP64__) || defined(_WIN64)) && !defined(__aarch64__)
*l = _mm_crc32_u64(*l, LE_LOAD64(*p));
*p += 8;
#else
@ -215,6 +216,9 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
*l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
*p += 4;
#endif
#else
Slow_CRC32(l, p);
#endif
}
template <void (*CRC32)(uint64_t*, uint8_t const**)>
@ -261,7 +265,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
}
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
return ExtendImpl<Fast_CRC32>(crc, buf, size);
#else
return ExtendImpl<Slow_CRC32>(crc, buf, size);

View File

@ -29,6 +29,8 @@
// the code that is built and the runtime checks to control what code is run.
#ifdef __SSE4_2__
#include <nmmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
#include <zlib.h>
@ -44,7 +46,7 @@ public:
static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
return crc32(hash, (const unsigned char*)data, bytes);
}
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
// Compute the Crc32 hash for data using SSE4 instructions. The input hash parameter is
// the current hash/seed value.
// This should only be called if SSE is supported.

View File

@ -23,6 +23,8 @@
#include <immintrin.h>
#elif __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
namespace doris {
@ -35,7 +37,7 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
#elif __SSE2__
#elif defined(__SSE2__) || defined(__aarch64__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
(static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(

View File

@ -19,6 +19,8 @@
#ifdef __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
#include <stdint.h>
@ -35,7 +37,7 @@ public:
static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
const auto flip_case_mask = 'A' ^ 'a';
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
const auto bytes_sse = sizeof(__m128i);
const auto src_end_sse = src_end - (src_end - src) % bytes_sse;

View File

@ -21,6 +21,10 @@
#include <cstdint>
#ifdef __aarch64__
#include <sse2neon.h>
#endif
#include "runtime/string_value.hpp"
#include "util/simd/lower_upper_impl.h"
@ -48,7 +52,7 @@ namespace simd {
class VStringFunctions {
public:
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
/// n equals to 16 chars length
static constexpr auto REGISTER_SIZE = sizeof(__m128i);
#endif
@ -59,7 +63,7 @@ public:
}
auto begin = 0;
auto end = str.len - 1;
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
char blank = ' ';
const auto pattern = _mm_set1_epi8(blank);
while (end - begin + 1 >= REGISTER_SIZE) {
@ -91,7 +95,7 @@ public:
}
auto begin = 0;
auto end = str.len - 1;
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
char blank = ' ';
const auto pattern = _mm_set1_epi8(blank);
while (end - begin + 1 >= REGISTER_SIZE) {
@ -155,7 +159,7 @@ public:
static constexpr auto hex_table = "0123456789ABCDEF";
auto src_str_end = src_str + length;
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__aarch64__)
constexpr auto step = sizeof(uint64);
if (src_str + step < src_str_end) {
const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hex_table));

File diff suppressed because it is too large Load Diff

View File

@ -20,6 +20,10 @@
#pragma once
#ifdef __aarch64__
#include <sse2neon.h>
#endif
#include "vec/columns/column.h"
#include "vec/columns/column_impl.h"
#include "vec/columns/columns_number.h"
@ -222,7 +226,7 @@ public:
bool has_null(size_t size) const override {
const UInt8* null_pos = get_null_map_data().data();
const UInt8* null_pos_end = get_null_map_data().data() + size;
#ifdef __SSE2__
#if defined(__SSE2__) || defined(__aarch64__)
/** A slightly more optimized version.
* Based on the assumption that often pieces of consecutive values
* completely pass or do not pass the filter.

View File

@ -18,9 +18,12 @@
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnsCommon.cpp
// and modified by Doris
#ifdef __SSE2__
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__aarch64__)
#include <sse2neon.h>
#endif
#include "util/simd/bits.h"
#include "vec/columns/column.h"
@ -41,7 +44,7 @@ size_t count_bytes_in_filter(const IColumn::Filter& filt) {
const Int8* pos = reinterpret_cast<const Int8*>(filt.data());
const Int8* end = pos + filt.size();
#if defined(__SSE2__) && defined(__POPCNT__)
#if defined(__SSE2__) || defined(__aarch64__) && defined(__POPCNT__)
const __m128i zero16 = _mm_setzero_si128();
const Int8* end64 = pos + filt.size() / 64 * 64;
@ -62,7 +65,9 @@ size_t count_bytes_in_filter(const IColumn::Filter& filt) {
/// TODO Add duff device for tail?
#endif
for (; pos < end; ++pos) count += *pos > 0;
for (; pos < end; ++pos) {
count += *pos > 0;
}
return count;
}

View File

@ -59,16 +59,13 @@ inline doris::vectorized::UInt64 int_hash64(doris::vectorized::UInt64 x) {
#include <nmmintrin.h>
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>
#include <arm_neon.h>
#if defined(__aarch64__)
#include <sse2neon.h>
#endif
inline doris::vectorized::UInt64 int_hash_crc32(doris::vectorized::UInt64 x) {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
return _mm_crc32_u64(-1ULL, x);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(-1U, x);
#else
/// On other platforms we do not have CRC32. NOTE This can be confusing.
return int_hash64(x);
@ -143,7 +140,7 @@ DEFINE_HASH(doris::vectorized::Float64)
template <>
struct HashCRC32<doris::vectorized::UInt256> {
size_t operator()(const doris::vectorized::UInt256& x) const {
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
doris::vectorized::UInt64 crc = -1ULL;
crc = _mm_crc32_u64(crc, x.a);
crc = _mm_crc32_u64(crc, x.b);

View File

@ -38,8 +38,12 @@ inline int cmp(T a, T b) {
/// Results don't depend on the values inside uninitialized memory but Memory Sanitizer cannot see it.
/// Disable optimized functions if compile with Memory Sanitizer.
#if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
#if (defined(__SSE2__) || defined(__aarch64__)) && !defined(MEMORY_SANITIZER)
#ifdef __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
/** All functions works under the following assumptions:
* - it's possible to read up to 15 excessive bytes after end of 'a' and 'b' region;

View File

@ -22,8 +22,12 @@
#include <string.h>
#if defined(__SSE2__) || defined(__aarch64__)
#ifdef __SSE2__
#include <emmintrin.h>
#elif __aarch64__
#include <sse2neon.h>
#endif
/** memcpy function could work suboptimal if all the following conditions are met:
* 1. Size of memory region is relatively small (approximately, under 50 bytes).

View File

@ -42,6 +42,10 @@
#include <smmintrin.h>
#endif
#if defined(__aarch64__)
#include <sse2neon.h>
#endif
/// The thing to avoid creating strings to find substrings in the hash table.
struct StringRef {
const char* data = nullptr;
@ -73,7 +77,7 @@ struct StringRef {
using StringRefs = std::vector<StringRef>;
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__aarch64__)
/** Compare strings for equality.
* The approach is controversial and does not win in all cases.
@ -164,7 +168,7 @@ inline bool operator==(StringRef lhs, StringRef rhs) {
if (lhs.size == 0) return true;
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__aarch64__)
return memequalSSE2Wide(lhs.data, rhs.data, lhs.size);
#else
return 0 == memcmp(lhs.data, rhs.data, lhs.size);
@ -197,7 +201,7 @@ struct StringRefHash64 {
size_t operator()(StringRef x) const { return util_hash::CityHash64(x.data, x.size); }
};
#if defined(__SSE4_2__)
#if defined(__SSE4_2__) || defined(__aarch64__)
/// Parts are taken from CityHash.

View File

@ -28,10 +28,14 @@
#include "gutil/hash/hash128to64.h"
#include "vec/core/types.h"
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#if defined(__aarch64__)
#include <sse2neon.h>
#endif
namespace doris::vectorized {
/// For aggregation by SipHash, UUID type or concatenation of several fields.
@ -146,7 +150,7 @@ struct UInt128Hash {
size_t operator()(UInt128 x) const { return Hash128to64({x.low, x.high}); }
};
#ifdef __SSE4_2__
#if defined(__SSE4_2__) || defined(__aarch64__)
struct UInt128HashCRC32 {
size_t operator()(UInt128 x) const {

View File

@ -301,7 +301,7 @@ if [ ${BUILD_BE} -eq 1 ] ; then
-DUSE_DWARF=${USE_DWARF} \
-DUSE_MEM_TRACKER=${USE_MEM_TRACKER} \
-DUSE_AVX2=${USE_AVX2} \
-DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ../
-DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ${DORIS_HOME}/be/
${BUILD_SYSTEM} -j ${PARALLEL}
${BUILD_SYSTEM} install
cd ${DORIS_HOME}

View File

@ -138,7 +138,7 @@ ${CMAKE_CMD} -G "${GENERATOR}" \
-DWITH_MYSQL=OFF \
-DUSE_DWARF=${USE_DWARF} \
-DUSE_MEM_TRACKER=ON \
${CMAKE_USE_CCACHE} ../
${CMAKE_USE_CCACHE} ${DORIS_HOME}/be/
${BUILD_SYSTEM} -j ${PARALLEL}
if [ ${RUN} -ne 1 ]; then

View File

@ -507,16 +507,21 @@ build_re2() {
# hyperscan
build_hyperscan() {
check_if_source_exist $RAGEL_SOURCE
cd $TP_SOURCE_DIR/$RAGEL_SOURCE
./configure --prefix=$TP_INSTALL_DIR && make install
MACHINE_TYPE=$(uname -m)
if [[ "${MACHINE_TYPE}" == "aarch64" ]]; then
echo "hyperscan is not supporting aarch64 now."
else
check_if_source_exist $RAGEL_SOURCE
cd $TP_SOURCE_DIR/$RAGEL_SOURCE
./configure --prefix=$TP_INSTALL_DIR && make install
check_if_source_exist $HYPERSCAN_SOURCE
cd $TP_SOURCE_DIR/$HYPERSCAN_SOURCE
mkdir -p $BUILD_DIR && cd $BUILD_DIR
PATH=$TP_INSTALL_DIR/bin:$PATH ${CMAKE_CMD} -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 \
-DBOOST_ROOT=$BOOST_SOURCE -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR ..
${BUILD_SYSTEM} -j $PARALLEL install
check_if_source_exist $HYPERSCAN_SOURCE
cd $TP_SOURCE_DIR/$HYPERSCAN_SOURCE
mkdir -p $BUILD_DIR && cd $BUILD_DIR
PATH=$TP_INSTALL_DIR/bin:$PATH ${CMAKE_CMD} -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 \
-DBOOST_ROOT=$BOOST_SOURCE -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR ..
${BUILD_SYSTEM} -j $PARALLEL install
fi
}
# boost
@ -1019,6 +1024,13 @@ build_opentelemetry() {
${BUILD_SYSTEM} -j $PARALLEL && ${BUILD_SYSTEM} install
}
# sse2neon
build_sse2neon() {
check_if_source_exist $SSE2NEON_SOURCE
cd $TP_SOURCE_DIR/$SSE2NEON_SOURCE
cp sse2neon.h $TP_INSTALL_DIR/include/
}
build_libunixodbc
build_openssl
build_libevent
@ -1037,7 +1049,7 @@ build_snappy
build_gperftools
build_curl
build_re2
build_hyperscan
# build_hyperscan
build_thrift
build_leveldb
build_brpc
@ -1070,6 +1082,7 @@ build_simdjson
build_nlohmann_json
build_opentelemetry
build_libbacktrace
build_sse2neon
echo "Finished to build all thirdparties"

11
thirdparty/vars.sh vendored
View File

@ -405,6 +405,14 @@ LIBBACKTRACE_NAME=libbacktrace-2446c66076480ce07a6bd868badcbceb3eeecc2e.zip
LIBBACKTRACE_SOURCE=libbacktrace-2446c66076480ce07a6bd868badcbceb3eeecc2e
LIBBACKTRACE_MD5SUM="6c79a8012870a24610c0d9c3621b23fe"
# sse2noen
SSE2NEON_DOWNLOAD="https://github.com/DLTcollab/sse2neon/archive/refs/tags/v1.5.1.tar.gz"
SSE2NEON_NAME=sse2neon-1.5.1.tar.gz
SSE2NEON_SOURCE=sse2neon-1.5.1
SSE2NEON_MD5SUM="9de5dc2970aa7efac7faee59e2826c51"
# all thirdparties which need to be downloaded is set in array TP_ARCHIVES
export TP_ARCHIVES="LIBEVENT
OPENSSL
@ -463,4 +471,5 @@ SIMDJSON
NLOHMANN_JSON
OPENTELEMETRY_PROTO
OPENTELEMETRY
LIBBACKTRACE"
LIBBACKTRACE
SSE2NEON"