Files
doris/be/src/util/simd/bits.h

154 lines
6.0 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <vector>
#include "util/sse_util.hpp"
namespace doris {
namespace simd {
/// TODO(zeno): once the build gains an AVX-512 compile option, turn this into
/// bytes64_mask_to_bits64_mask.
///
/// Packs a 32-byte mask into a 32-bit mask: bit i of the result describes data[i].
/// Exactly 32 bytes are read starting at `data` (unaligned loads are used).
///
/// Note: the SIMD paths set bit i when data[i], viewed as a signed 8-bit value, is
/// greater than zero, while the scalar fallback sets it only when data[i] == 1.
/// The paths therefore agree for masks that contain only 0 and 1.
inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
#ifdef __AVX2__
    // One 256-bit load covers all 32 bytes.
    const __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data));
    const __m256i positive = _mm256_cmpgt_epi8(bytes, _mm256_setzero_si256());
    return static_cast<uint32_t>(_mm256_movemask_epi8(positive));
#elif defined(__SSE2__) || defined(__aarch64__)
    // Two 128-bit halves: bytes 0..15 feed bits 0..15, bytes 16..31 feed bits 16..31.
    const __m128i zeros = _mm_setzero_si128();
    const __m128i low_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
    const __m128i high_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 16));
    const uint32_t low_bits =
            static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(low_bytes, zeros)));
    const uint32_t high_bits =
            static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(high_bytes, zeros)));
    return low_bits | ((high_bits << 16) & 0xffff0000);
#else
    // Portable fallback: test each byte individually.
    uint32_t bits = 0;
    for (std::size_t idx = 0; idx < 32; ++idx) {
        if (data[idx] == 1) {
            bits |= static_cast<uint32_t>(1) << idx;
        }
    }
    return bits;
#endif
}
inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) {
return bytes32_mask_to_bits32_mask(reinterpret_cast<const uint8_t*>(data));
}
/// Counts how many of the `size` bytes starting at `data` are equal to zero.
///
/// When SSE2 and POPCNT are available, the input is consumed 64 bytes at a time:
/// four 16-byte equality masks are packed into one 64-bit word and pop-counted.
/// Whatever is left (or everything, without SIMD) is counted byte by byte.
inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
    size_t zeros = 0;
    const int8_t* const finish = data + size;
#if defined(__SSE2__) && defined(__POPCNT__)
    const __m128i all_zero = _mm_setzero_si128();
    // 16-bit mask (widened to 64 bits) with bit k set when p[k] == 0.
    const auto zero_bits = [&all_zero](const int8_t* p) -> uint64_t {
        const __m128i lane = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        return static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(lane, all_zero)));
    };
    const int8_t* const simd_finish = data + (size / 64 * 64);
    while (data < simd_finish) {
        const uint64_t packed = zero_bits(data) | (zero_bits(data + 16) << 16u) |
                                (zero_bits(data + 32) << 32u) | (zero_bits(data + 48) << 48u);
        zeros += __builtin_popcountll(packed);
        data += 64;
    }
#endif
    // Scalar tail (or the whole input when the SIMD path is compiled out).
    while (data < finish) {
        if (*data == 0) {
            ++zeros;
        }
        ++data;
    }
    return zeros;
}
/// Counts the positions i in [0, size) where data[i] == 0 or null_map[i] != 0.
///
/// `data` and `null_map` must each reference at least `size` readable bytes.
/// null_map is presumably a 0/1 flag array; any non-zero byte is treated as "null"
/// and contributes exactly one to the count, on both the SIMD and the scalar path.
///
/// When SSE2 and POPCNT are available the input is consumed 64 bytes at a time.
inline size_t count_zero_num(const int8_t* __restrict data, const uint8_t* __restrict null_map,
                             size_t size) {
    size_t num = 0;
    const int8_t* end = data + size;
#if defined(__SSE2__) && defined(__POPCNT__)
    const __m128i zero16 = _mm_setzero_si128();
    const int8_t* end64 = data + (size / 64 * 64);
    // null_map must advance in lock-step with data, otherwise later chunks (and the
    // scalar tail) would be paired with the wrong null flags.
    for (; data < end64; data += 64, null_map += 64) {
        // Bit k set <=> element k is NOT counted (data != 0 and null flag == 0).
        // The null flag is compared against zero instead of being OR-ed in directly,
        // because _mm_movemask_epi8 only samples the top bit of each byte and a flag
        // value of 1 would otherwise be invisible.
        uint64_t not_counted = 0;
        for (int lane = 0; lane < 4; ++lane) {
            const __m128i values =
                    _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + lane * 16));
            const __m128i nulls =
                    _mm_loadu_si128(reinterpret_cast<const __m128i*>(null_map + lane * 16));
            // andnot(a, b) == (~a) & b  ->  (data != 0) & (null == 0)
            const __m128i non_zero_and_not_null = _mm_andnot_si128(
                    _mm_cmpeq_epi8(values, zero16), _mm_cmpeq_epi8(nulls, zero16));
            not_counted |= static_cast<uint64_t>(_mm_movemask_epi8(non_zero_and_not_null))
                           << (lane * 16);
        }
        num += 64 - static_cast<size_t>(__builtin_popcountll(not_counted));
    }
#endif
    // Scalar tail (or the whole input when the SIMD path is compiled out).
    for (; data < end; ++data, ++null_map) {
        num += static_cast<size_t>((*data == 0) | (*null_map != 0));
    }
    return num;
}
// TODO: compare against other SIMD implementations.
//
// Returns the index of the first element at or after `start` that matches `byte`,
// or vec.size() when there is no match. If `start` is already past the end,
// `start` itself is returned unchanged.
//
// The search uses std::memchr, which compares single bytes and is given the number
// of remaining elements as its length.
// NOTE(review): this looks intended for single-byte T only - confirm with callers.
template <class T>
static size_t find_byte(const std::vector<T>& vec, size_t start, T byte) {
    const size_t total = vec.size();
    if (start >= total) {
        return start;
    }
    const T* const base = vec.data();
    const void* const hit = std::memchr(static_cast<const void*>(base + start), byte, total - start);
    if (hit != nullptr) {
        return static_cast<const T*>(hit) - base;
    }
    return total;
}
// Returns true when std::memchr finds `byte` within the first `length` bytes
// starting at `data`. `length` is passed to memchr unchanged, so it is a byte count.
template <typename T>
bool contain_byte(const T* __restrict data, const size_t length, const signed char byte) {
    const void* const found = std::memchr(static_cast<const void*>(data), byte, length);
    return found != nullptr;
}
// Index of the first element equal to 1 at or after `start`; vec.size() when none
// is found, or `start` itself when `start` is already past the end.
inline size_t find_one(const std::vector<uint8_t>& vec, size_t start) {
    const uint8_t wanted = 1;
    return find_byte<uint8_t>(vec, start, wanted);
}
// Index of the first element equal to 0 at or after `start`; vec.size() when none
// is found, or `start` itself when `start` is already past the end.
inline size_t find_zero(const std::vector<uint8_t>& vec, size_t start) {
    const uint8_t wanted = 0;
    return find_byte<uint8_t>(vec, start, wanted);
}
} // namespace simd
} // namespace doris