[enhancement] support simd instructions on arm cpus through sse2neon (#10068)

* [enhancement] support simd instructions on arm cpus through sse2neon
2022-06-14 09:17:09 +08:00
parent 7cf0cc7dd6
commit 39a2785ce2
20 changed files with 105 additions and 3176 deletions
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@ -412,7 +412,7 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
        set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
    endif()
 endif()
-set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}  -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
+set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")

 if (WITH_MYSQL)
    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_MYSQL")
--- a/be/src/exprs/block_bloom_filter_impl.cc
+++ b/be/src/exprs/block_bloom_filter_impl.cc
@ -20,8 +20,8 @@
 // and modified by Doris

 #ifdef __aarch64__
-#include "util/sse2neon.h"
-#else //__aarch64__
+#include <sse2neon.h>
+#else
 #include <emmintrin.h>
 #include <mm_malloc.h>
 #endif
@ -115,16 +115,10 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h
        new_bucket[i] = 1U << new_bucket[i];
    }
    for (int i = 0; i < 2; ++i) {
-#ifdef __aarch64__
-        uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
-        uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&_directory[bucket_idx][4 * i]);
-        *existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
-#else
        __m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
        __m128i* existing_bucket =
                reinterpret_cast<__m128i*>(&DCHECK_NOTNULL(_directory)[bucket_idx][4 * i]);
        *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
-#endif
    }
 }

@ -194,7 +188,7 @@ Status BlockBloomFilter::or_equal_array(size_t n, const uint8_t* __restrict__ in

 void BlockBloomFilter::or_equal_array_no_avx2(size_t n, const uint8_t* __restrict__ in,
                                              uint8_t* __restrict__ out) {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
    // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
    // written in a way that is very friendly to auto-vectorization. Instead, we manually
    // vectorize, increasing the speed by up to 56x.
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@ -26,7 +26,7 @@
 #include "gutil/bits.h"
 #include "util/cpu_info.h"
 #ifdef __aarch64__
-#include "sse2neon.h"
+#include <sse2neon.h>
 #else
 #include <emmintrin.h>
 #include <immintrin.h>
--- a/be/src/util/crc32c.cpp
+++ b/be/src/util/crc32c.cpp
@ -19,8 +19,10 @@
 // https://github.com/facebook/rocksdb/blob/master/util/crc32c.cc

 #include "util/crc32c.h"
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 #include <nmmintrin.h>
+#elif defined(__aarch64__)
+#include <sse2neon.h>
 #endif
 #include "util/coding.h"

@ -204,9 +206,8 @@ static inline uint64_t LE_LOAD64(const uint8_t* p) {
 }

 static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
-#ifndef __SSE4_2__
-    Slow_CRC32(l, p);
-#elif defined(__LP64__) || defined(_WIN64)
+#if defined(__SSE4_2__) || defined(__aarch64__)
+#if (defined(__LP64__) || defined(_WIN64)) && !defined(__aarch64__)
    *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
    *p += 8;
 #else
@ -215,6 +216,9 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
    *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
    *p += 4;
 #endif
+#else
+    Slow_CRC32(l, p);
+#endif
 }

 template <void (*CRC32)(uint64_t*, uint8_t const**)>
@ -261,7 +265,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 }

 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
    return ExtendImpl<Fast_CRC32>(crc, buf, size);
 #else
    return ExtendImpl<Slow_CRC32>(crc, buf, size);
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@ -29,6 +29,8 @@
 // the code that is built and the runtime checks to control what code is run.
 #ifdef __SSE4_2__
 #include <nmmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif
 #include <zlib.h>

@ -44,7 +46,7 @@ public:
    static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
        return crc32(hash, (const unsigned char*)data, bytes);
    }
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
    // Compute the Crc32 hash for data using SSE4 instructions.  The input hash parameter is
    // the current hash/seed value.
    // This should only be called if SSE is supported.
--- a/be/src/util/simd/bits.h
+++ b/be/src/util/simd/bits.h
@ -23,6 +23,8 @@
 #include <immintrin.h>
 #elif __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif

 namespace doris {
@ -35,7 +37,7 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
    auto zero32 = _mm256_setzero_si256();
    uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
            _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
-#elif __SSE2__
+#elif defined(__SSE2__) || defined(__aarch64__)
    auto zero16 = _mm_setzero_si128();
    uint32_t mask =
            (static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(
--- a/be/src/util/simd/lower_upper_impl.h
+++ b/be/src/util/simd/lower_upper_impl.h
@ -19,6 +19,8 @@

 #ifdef __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
 #endif
 #include <stdint.h>

@ -35,7 +37,7 @@ public:
    static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
        const auto flip_case_mask = 'A' ^ 'a';

-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
        const auto bytes_sse = sizeof(__m128i);
        const auto src_end_sse = src_end - (src_end - src) % bytes_sse;

--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@ -21,6 +21,10 @@

 #include <cstdint>

+#ifdef __aarch64__
+#include <sse2neon.h>
+#endif
+
 #include "runtime/string_value.hpp"
 #include "util/simd/lower_upper_impl.h"

@ -48,7 +52,7 @@ namespace simd {

 class VStringFunctions {
 public:
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
    /// n equals to 16 chars length
    static constexpr auto REGISTER_SIZE = sizeof(__m128i);
 #endif
@ -59,7 +63,7 @@ public:
        }
        auto begin = 0;
        auto end = str.len - 1;
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
        char blank = ' ';
        const auto pattern = _mm_set1_epi8(blank);
        while (end - begin + 1 >= REGISTER_SIZE) {
@ -91,7 +95,7 @@ public:
        }
        auto begin = 0;
        auto end = str.len - 1;
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
        char blank = ' ';
        const auto pattern = _mm_set1_epi8(blank);
        while (end - begin + 1 >= REGISTER_SIZE) {
@ -155,7 +159,7 @@ public:
        static constexpr auto hex_table = "0123456789ABCDEF";
        auto src_str_end = src_str + length;

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__aarch64__)
        constexpr auto step = sizeof(uint64);
        if (src_str + step < src_str_end) {
            const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hex_table));
--- a/be/src/util/sse2neon.h
+++ b/be/src/util/sse2neon.h
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@ -20,6 +20,10 @@

 #pragma once

+#ifdef __aarch64__
+#include <sse2neon.h>
+#endif
+
 #include "vec/columns/column.h"
 #include "vec/columns/column_impl.h"
 #include "vec/columns/columns_number.h"
@ -222,7 +226,7 @@ public:
    bool has_null(size_t size) const override {
        const UInt8* null_pos = get_null_map_data().data();
        const UInt8* null_pos_end = get_null_map_data().data() + size;
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__aarch64__)
        /** A slightly more optimized version.
        * Based on the assumption that often pieces of consecutive values
        *  completely pass or do not pass the filter.
--- a/be/src/vec/columns/columns_common.cpp
+++ b/be/src/vec/columns/columns_common.cpp
@ -18,9 +18,12 @@
 // https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnsCommon.cpp
 // and modified by Doris

-#ifdef __SSE2__
+#if defined(__SSE2__)
 #include <emmintrin.h>
 #endif
+#if defined(__aarch64__)
+#include <sse2neon.h>
+#endif

 #include "util/simd/bits.h"
 #include "vec/columns/column.h"
@ -41,7 +44,7 @@ size_t count_bytes_in_filter(const IColumn::Filter& filt) {
    const Int8* pos = reinterpret_cast<const Int8*>(filt.data());
    const Int8* end = pos + filt.size();

-#if defined(__SSE2__) && defined(__POPCNT__)
+#if defined(__SSE2__) || defined(__aarch64__) && defined(__POPCNT__)
    const __m128i zero16 = _mm_setzero_si128();
    const Int8* end64 = pos + filt.size() / 64 * 64;

@ -62,7 +65,9 @@ size_t count_bytes_in_filter(const IColumn::Filter& filt) {
        /// TODO Add duff device for tail?
 #endif

-    for (; pos < end; ++pos) count += *pos > 0;
+    for (; pos < end; ++pos) {
+        count += *pos > 0;
+    }

    return count;
 }
--- a/be/src/vec/common/hash_table/hash.h
+++ b/be/src/vec/common/hash_table/hash.h
@ -59,16 +59,13 @@ inline doris::vectorized::UInt64 int_hash64(doris::vectorized::UInt64 x) {
 #include <nmmintrin.h>
 #endif

-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-#include <arm_acle.h>
-#include <arm_neon.h>
+#if defined(__aarch64__)
+#include <sse2neon.h>
 #endif

 inline doris::vectorized::UInt64 int_hash_crc32(doris::vectorized::UInt64 x) {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
    return _mm_crc32_u64(-1ULL, x);
-#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    return __crc32cd(-1U, x);
 #else
    /// On other platforms we do not have CRC32. NOTE This can be confusing.
    return int_hash64(x);
@ -143,7 +140,7 @@ DEFINE_HASH(doris::vectorized::Float64)
 template <>
 struct HashCRC32<doris::vectorized::UInt256> {
    size_t operator()(const doris::vectorized::UInt256& x) const {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)
        doris::vectorized::UInt64 crc = -1ULL;
        crc = _mm_crc32_u64(crc, x.a);
        crc = _mm_crc32_u64(crc, x.b);
--- a/be/src/vec/common/memcmp_small.h
+++ b/be/src/vec/common/memcmp_small.h
@ -38,8 +38,12 @@ inline int cmp(T a, T b) {
 /// Results don't depend on the values inside uninitialized memory but Memory Sanitizer cannot see it.
 /// Disable optimized functions if compile with Memory Sanitizer.

-#if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
+#if (defined(__SSE2__) || defined(__aarch64__)) && !defined(MEMORY_SANITIZER)
+#ifdef __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
+#endif

 /** All functions works under the following assumptions:
  * - it's possible to read up to 15 excessive bytes after end of 'a' and 'b' region;
--- a/be/src/vec/common/memcpy_small.h
+++ b/be/src/vec/common/memcpy_small.h
@ -22,8 +22,12 @@

 #include <string.h>

+#if defined(__SSE2__) || defined(__aarch64__)
 #ifdef __SSE2__
 #include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
+#endif

 /** memcpy function could work suboptimal if all the following conditions are met:
  * 1. Size of memory region is relatively small (approximately, under 50 bytes).
--- a/be/src/vec/common/string_ref.h
+++ b/be/src/vec/common/string_ref.h
@ -42,6 +42,10 @@
 #include <smmintrin.h>
 #endif

+#if defined(__aarch64__)
+#include <sse2neon.h>
+#endif
+
 /// The thing to avoid creating strings to find substrings in the hash table.
 struct StringRef {
    const char* data = nullptr;
@ -73,7 +77,7 @@ struct StringRef {

 using StringRefs = std::vector<StringRef>;

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__aarch64__)

 /** Compare strings for equality.
  * The approach is controversial and does not win in all cases.
@ -164,7 +168,7 @@ inline bool operator==(StringRef lhs, StringRef rhs) {

    if (lhs.size == 0) return true;

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__aarch64__)
    return memequalSSE2Wide(lhs.data, rhs.data, lhs.size);
 #else
    return 0 == memcmp(lhs.data, rhs.data, lhs.size);
@ -197,7 +201,7 @@ struct StringRefHash64 {
    size_t operator()(StringRef x) const { return util_hash::CityHash64(x.data, x.size); }
 };

-#if defined(__SSE4_2__)
+#if defined(__SSE4_2__) || defined(__aarch64__)

 /// Parts are taken from CityHash.

--- a/be/src/vec/common/uint128.h
+++ b/be/src/vec/common/uint128.h
@ -28,10 +28,14 @@
 #include "gutil/hash/hash128to64.h"
 #include "vec/core/types.h"

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 #include <nmmintrin.h>
 #endif

+#if defined(__aarch64__)
+#include <sse2neon.h>
+#endif
+
 namespace doris::vectorized {

 /// For aggregation by SipHash, UUID type or concatenation of several fields.
@ -146,7 +150,7 @@ struct UInt128Hash {
    size_t operator()(UInt128 x) const { return Hash128to64({x.low, x.high}); }
 };

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__) || defined(__aarch64__)

 struct UInt128HashCRC32 {
    size_t operator()(UInt128 x) const {
--- a/build.sh
+++ b/build.sh
@ -301,7 +301,7 @@ if [ ${BUILD_BE} -eq 1 ] ; then
            -DUSE_DWARF=${USE_DWARF} \
            -DUSE_MEM_TRACKER=${USE_MEM_TRACKER} \
            -DUSE_AVX2=${USE_AVX2} \
-            -DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ../
+            -DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ${DORIS_HOME}/be/
    ${BUILD_SYSTEM} -j ${PARALLEL}
    ${BUILD_SYSTEM} install
    cd ${DORIS_HOME}
--- a/run-be-ut.sh
+++ b/run-be-ut.sh
@ -138,7 +138,7 @@ ${CMAKE_CMD} -G "${GENERATOR}" \
    -DWITH_MYSQL=OFF \
    -DUSE_DWARF=${USE_DWARF} \
    -DUSE_MEM_TRACKER=ON \
-    ${CMAKE_USE_CCACHE} ../
+    ${CMAKE_USE_CCACHE} ${DORIS_HOME}/be/
 ${BUILD_SYSTEM} -j ${PARALLEL}

 if [ ${RUN} -ne 1 ]; then
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@ -507,16 +507,21 @@ build_re2() {

 # hyperscan
 build_hyperscan() {
-    check_if_source_exist $RAGEL_SOURCE
-    cd $TP_SOURCE_DIR/$RAGEL_SOURCE
-    ./configure --prefix=$TP_INSTALL_DIR && make install
+    MACHINE_TYPE=$(uname -m)
+    if [[ "${MACHINE_TYPE}" == "aarch64" ]]; then
+        echo "hyperscan is not supporting aarch64 now."
+    else
+        check_if_source_exist $RAGEL_SOURCE
+        cd $TP_SOURCE_DIR/$RAGEL_SOURCE
+        ./configure --prefix=$TP_INSTALL_DIR && make install

-    check_if_source_exist $HYPERSCAN_SOURCE
-    cd $TP_SOURCE_DIR/$HYPERSCAN_SOURCE
-    mkdir -p $BUILD_DIR && cd $BUILD_DIR
-    PATH=$TP_INSTALL_DIR/bin:$PATH ${CMAKE_CMD} -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 \
-    -DBOOST_ROOT=$BOOST_SOURCE -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR ..
-    ${BUILD_SYSTEM} -j $PARALLEL install
+        check_if_source_exist $HYPERSCAN_SOURCE
+        cd $TP_SOURCE_DIR/$HYPERSCAN_SOURCE
+        mkdir -p $BUILD_DIR && cd $BUILD_DIR
+        PATH=$TP_INSTALL_DIR/bin:$PATH ${CMAKE_CMD} -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 \
+        -DBOOST_ROOT=$BOOST_SOURCE -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR ..
+        ${BUILD_SYSTEM} -j $PARALLEL install
+    fi
 }

 # boost
@ -1019,6 +1024,13 @@ build_opentelemetry() {
    ${BUILD_SYSTEM} -j $PARALLEL && ${BUILD_SYSTEM} install
 }

+# sse2neon
+build_sse2neon() {
+    check_if_source_exist $SSE2NEON_SOURCE
+    cd $TP_SOURCE_DIR/$SSE2NEON_SOURCE
+    cp sse2neon.h $TP_INSTALL_DIR/include/
+}
+
 build_libunixodbc
 build_openssl
 build_libevent
@ -1037,7 +1049,7 @@ build_snappy
 build_gperftools
 build_curl
 build_re2
-build_hyperscan
+# build_hyperscan
 build_thrift
 build_leveldb
 build_brpc
@ -1070,6 +1082,7 @@ build_simdjson
 build_nlohmann_json
 build_opentelemetry
 build_libbacktrace
+build_sse2neon

 echo "Finished to build all thirdparties"

--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@ -405,6 +405,14 @@ LIBBACKTRACE_NAME=libbacktrace-2446c66076480ce07a6bd868badcbceb3eeecc2e.zip
 LIBBACKTRACE_SOURCE=libbacktrace-2446c66076480ce07a6bd868badcbceb3eeecc2e
 LIBBACKTRACE_MD5SUM="6c79a8012870a24610c0d9c3621b23fe"

+# sse2noen
+SSE2NEON_DOWNLOAD="https://github.com/DLTcollab/sse2neon/archive/refs/tags/v1.5.1.tar.gz"
+SSE2NEON_NAME=sse2neon-1.5.1.tar.gz
+SSE2NEON_SOURCE=sse2neon-1.5.1
+SSE2NEON_MD5SUM="9de5dc2970aa7efac7faee59e2826c51"
+
+
+
 # all thirdparties which need to be downloaded is set in array TP_ARCHIVES
 export TP_ARCHIVES="LIBEVENT
 OPENSSL
@ -463,4 +471,5 @@ SIMDJSON
 NLOHMANN_JSON
 OPENTELEMETRY_PROTO
 OPENTELEMETRY
-LIBBACKTRACE"
+LIBBACKTRACE
+SSE2NEON"