Fix SIMD instructions not compatible with some platforms

DengzhiLiu
2024-03-04 11:45:23 +00:00
committed by ob-robot
parent c71007cee0
commit 4bb2e4280e
3 changed files with 138 additions and 87 deletions

View File

@@ -284,12 +284,6 @@ ob_set_subtarget(oblib_lib_bitmap common
)
ob_lib_add_target(oblib_lib_bitmap)
if (${ARCHITECTURE} STREQUAL "x86_64")
target_compile_options(oblib_lib_bitmap
PRIVATE
-mbmi2
)
endif()
ob_set_subtarget(ob_malloc_object_list common_alloc
alloc/abit_set.cpp
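
Dropping the unconditional -mbmi2 flag is the point of this change: a library-wide flag licenses the compiler to emit BMI2 instructions anywhere in oblib_lib_bitmap, which raises SIGILL on CPUs without BMI2. A minimal standalone sketch (not part of this diff) of the safer pattern the multitarget framework enables, confining the extension to one function and gating it behind a runtime CPU check; expand_bits is a hypothetical name, while the GCC/Clang attribute and builtin are real:

#include <cstdint>
#include <immintrin.h>

// Only this function is compiled with BMI2 enabled, so the rest of the
// translation unit stays runnable on older CPUs.
__attribute__((target("bmi2")))
static uint64_t expand_bits_bmi2(uint64_t bits)
{
  return _pdep_u64(bits, 0x0101010101010101ULL);
}

static uint64_t expand_bits(uint64_t bits)
{
  if (__builtin_cpu_supports("bmi2")) {  // runtime check, resolved per CPU
    return expand_bits_bmi2(bits);
  }
  uint64_t res = 0;
  for (int i = 0; i < 8; ++i) {          // scalar fallback: bit i -> byte i
    res |= ((bits >> i) & 1) << (8 * i);
  }
  return res;
}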

View File

@@ -10,57 +10,91 @@
* See the Mulan PubL v2 for more details.
*/
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX512F__) || defined(__AVX512BW__) || defined(__AVX__) || defined(__AVX2__) || defined(__BMI2__)
#include <immintrin.h>
#endif
#include "lib/container/ob_bitmap.h"
#include "common/ob_target_specific.h"
#if OB_USE_MULTITARGET_CODE
#include <emmintrin.h>
#include <immintrin.h>
#endif
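The include change above is the heart of the portability fix: <emmintrin.h> and <immintrin.h> are x86-only headers, so guarding them behind OB_USE_MULTITARGET_CODE (which is presumably defined only for x86_64 builds) keeps non-x86 platforms such as aarch64 from ever seeing intrinsics they cannot compile, instead of gating on feature macros like __AVX2__ that vary with compile flags.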
namespace oceanbase
{
namespace common
{
// Transform 64-byte mask to 64-bit mask
OB_INLINE static uint64_t bytes64mask_to_bits64mask(
OB_DECLARE_AVX512_SPECIFIC_CODE(
inline static uint64_t bytes64mask_to_bits64mask(
const uint8_t *bytes64,
const bool need_flip = false)
{
#if defined(__AVX512F__) && defined(__AVX512BW__)
const __m512i vbytes = _mm512_loadu_si512(reinterpret_cast<const void *>(bytes64));
uint64_t res = _mm512_testn_epi8_mask(vbytes, vbytes);
#elif defined(__AVX__) && defined(__AVX2__)
const __m256i zero32 = _mm256_setzero_si256();
uint64_t res =
(static_cast<uint64_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(bytes64)), zero32))) & 0xffffffff)
| (static_cast<uint64_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(bytes64 + 32)), zero32))) << 32);
#elif defined(__SSE2__)
const __m128i zero16 = _mm_setzero_si128();
uint64_t res =
(static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64)), zero16))) & 0xffff)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 16)), zero16))) << 16) & 0xffff0000)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 32)), zero16))) << 32) & 0xffff00000000)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 48)), zero16))) << 48) & 0xffff000000000000);
#else
uint64_t res = 0;
for (int64_t i = 0; i < 64; ++i) {
res |= static_cast<uint64_t>(0 == bytes64[i]) << i;
}
#endif
if (!need_flip) {
res = ~res;
}
return res;
}
)
OB_DECLARE_AVX2_SPECIFIC_CODE(
inline static uint64_t bytes64mask_to_bits64mask(
const uint8_t *bytes64,
const bool need_flip = false)
{
const __m256i zero32 = _mm256_setzero_si256();
uint64_t res =
(static_cast<uint64_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(bytes64)), zero32))) & 0xffffffff)
| (static_cast<uint64_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(bytes64 + 32)), zero32))) << 32);
if (!need_flip) {
res = ~res;
}
return res;
}
)
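The & 0xffffffff in the AVX2 variant above is load-bearing: _mm256_movemask_epi8 returns a signed int, and when byte 31 compares equal the result has bit 31 set, so a bare cast to uint64_t sign-extends ones into the high half and corrupts the OR with the upper 32-bit lane. A standalone snippet (not part of this diff) showing the hazard:

#include <cstdint>
#include <cstdio>

int main()
{
  // All 32 bytes matched: the movemask result is -1 as a signed int.
  int movemask = -1;
  uint64_t without_mask = static_cast<uint64_t>(movemask);            // 0xffffffffffffffff
  uint64_t with_mask = static_cast<uint64_t>(movemask) & 0xffffffff;  // 0x00000000ffffffff
  printf("%016llx vs %016llx\n",
         (unsigned long long)without_mask, (unsigned long long)with_mask);
  return 0;
}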
OB_DECLARE_SSE42_SPECIFIC_CODE(
inline static uint64_t bytes64mask_to_bits64mask(
const uint8_t *bytes64,
const bool need_flip = false)
{
const __m128i zero16 = _mm_setzero_si128();
uint64_t res =
(static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64)), zero16))) & 0xffff)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 16)), zero16))) << 16) & 0xffff0000)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 32)), zero16))) << 32) & 0xffff00000000)
| ((static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 48)), zero16))) << 48) & 0xffff000000000000);
if (!need_flip) {
res = ~res;
}
return res;
}
)
OB_DECLARE_DEFAULT_CODE(
inline static uint64_t bytes64mask_to_bits64mask(
const uint8_t *bytes64,
const bool need_flip = false)
{
uint64_t res = 0;
for (int64_t i = 0; i < 64; ++i) {
res |= static_cast<uint64_t>(0 == bytes64[i]) << i;
}
if (!need_flip) {
res = ~res;
}
return res;
}
)
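The four OB_DECLARE_*_SPECIFIC_CODE blocks above emit the same function once per instruction set; judging from the qualified calls later in this diff, each copy lands in its own namespace (common::specific::avx512, ::avx2, ::normal, and presumably ::sse42 following the same naming). A hypothetical caller would then select a variant at runtime, mirroring the is_arch_supported dispatch used below:

// Sketch only: to_bits64 is a made-up wrapper, not code from this commit.
uint64_t to_bits64(const uint8_t *bytes64, const bool need_flip)
{
#if OB_USE_MULTITARGET_CODE
  if (common::is_arch_supported(ObTargetArch::AVX512)) {
    return common::specific::avx512::bytes64mask_to_bits64mask(bytes64, need_flip);
  } else if (common::is_arch_supported(ObTargetArch::AVX2)) {
    return common::specific::avx2::bytes64mask_to_bits64mask(bytes64, need_flip);
  } else if (common::is_arch_supported(ObTargetArch::SSE42)) {
    return common::specific::sse42::bytes64mask_to_bits64mask(bytes64, need_flip);
  }
#endif
  return common::specific::normal::bytes64mask_to_bits64mask(bytes64, need_flip);
}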
OB_INLINE static uint8_t is_bit_set(
const uint8_t *src_byte,
@@ -260,9 +294,7 @@ inline static void bitmap_to_bits_mask(
}
}
}
)
OB_DECLARE_DEFAULT_CODE(
inline static void uint64_mask_to_bits_mask(
const uint64_t *data,
const int64_t size,
@@ -470,6 +502,8 @@ int ObBitmap::bit_and(const ObBitmap &right)
#if OB_USE_MULTITARGET_CODE
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
SelectOpImpl<SelectAndOp>::apply_op_avx2(data_, right.data_, valid_bytes_);
} else if (common::is_arch_supported(ObTargetArch::SSE42)) {
SelectOpImpl<SelectAndOp>::apply_op_sse42(data_, right.data_, valid_bytes_);
#endif
} else {
SelectOpImpl<SelectAndOp>::apply_op(data_, right.data_, valid_bytes_);
@@ -489,6 +523,8 @@ int ObBitmap::bit_or(const ObBitmap &right)
#if OB_USE_MULTITARGET_CODE
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
SelectOpImpl<SelectOrOp>::apply_op_avx2(data_, right.data_, valid_bytes_);
} else if (common::is_arch_supported(ObTargetArch::SSE42)) {
SelectOpImpl<SelectOrOp>::apply_op_sse42(data_, right.data_, valid_bytes_);
#endif
} else {
SelectOpImpl<SelectOrOp>::apply_op(data_, right.data_, valid_bytes_);
@@ -505,6 +541,8 @@ int ObBitmap::bit_not()
#if OB_USE_MULTITARGET_CODE
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
SelectOpImpl<SelectNotOp>::apply_not_op_avx2(data_, valid_bytes_);
} else if (common::is_arch_supported(ObTargetArch::SSE42)) {
SelectOpImpl<SelectNotOp>::apply_not_op_sse42(data_, valid_bytes_);
#endif
} else {
SelectOpImpl<SelectNotOp>::apply_not_op(data_, valid_bytes_);
@@ -657,6 +695,61 @@ int ObBitmap::set_bitmap_batch(const int64_t offset, const int64_t count, const
return ret;
}
OB_DECLARE_AVX2_SPECIFIC_CODE(
inline static void inner_from_bits_mask(
const int64_t from,
const int64_t to,
uint8_t* bits,
uint8_t* data)
{
const uint64_t size = to - from;
const uint8_t *pos = bits;
const uint8_t *end_pos32 = pos + size / 32 * 4;
uint8_t *out = data + from;
for (; pos < end_pos32; pos += 4) {
// we only use the low 32 bits of each lane, but this is fine with AVX2
__m256i xbcast = _mm256_set1_epi32(*(reinterpret_cast<const int32_t *>(pos)));
// Each byte gets the source byte containing the corresponding bit
__m256i shufmask = _mm256_set_epi64x(
0x0303030303030303, 0x0202020202020202,
0x0101010101010101, 0x0000000000000000);
__m256i shuf = _mm256_shuffle_epi8(xbcast, shufmask);
__m256i andmask = _mm256_set1_epi64x(0x8040201008040201); // every 8 bits -> 8 bytes, pattern repeats.
__m256i isolated_inverted = _mm256_andnot_si256(shuf, andmask);
// this is the extra step: compare each byte == 0 to produce 0 or -1
__m256i z = _mm256_cmpeq_epi8(isolated_inverted, _mm256_setzero_si256());
// alternative: compare against the AND mask to get 0 or -1,
// avoiding the need for a vector zero constant.
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out), z);
out += 32;
}
const int64_t remain_size = (to - from) % 32;
for (int64_t idx = 0; idx < remain_size; ++idx) {
*(out++) = is_bit_set(pos, idx);
}
}
)
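The loop above expands each group of 32 mask bits into 32 bytes: the shuffle gives every destination byte a copy of the source byte holding its bit, the AND mask 0x8040201008040201 isolates exactly that bit per byte, and because andnot inverts first, a set bit yields a zero byte that the final compare-to-zero turns into 0xff. A standalone scalar walkthrough of one byte (not part of this diff):

#include <cstdint>
#include <cstdio>

int main()
{
  // One source byte; bit i should expand to byte i (0x00 or 0xff).
  uint8_t bits = 0xb2;  // 1011 0010: bits 1, 4, 5, 7 are set
  // Broadcast the byte into all 8 byte lanes (the shuffle step).
  uint64_t broadcast = 0x0101010101010101ULL * bits;
  // andnot with the magic mask: byte i becomes zero iff bit i was set.
  uint64_t isolated_inverted = ~broadcast & 0x8040201008040201ULL;
  // Compare each byte against zero (the _mm256_cmpeq_epi8 step).
  for (int i = 0; i < 8; ++i) {
    uint8_t b = (isolated_inverted >> (8 * i)) & 0xff;
    printf("bit %d -> 0x%02x\n", i, b == 0 ? 0xff : 0x00);
  }
  return 0;
}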
OB_DECLARE_DEFAULT_CODE(
inline static void inner_from_bits_mask(
const int64_t from,
const int64_t to,
uint8_t* bits,
uint8_t* data)
{
const uint64_t size = to - from;
uint8_t *out = data + from;
uint64_t *bits64 = reinterpret_cast<uint64_t *>(bits);
for (uint64_t i = 0; i < size; ++i) {
if (bits64[i / 64] & (1LU << (i % 64))) {
*out = 1;
}
++out;
}
}
)
int ObBitmap::from_bits_mask(
const int64_t from,
const int64_t to,
@@ -666,55 +759,14 @@ int ObBitmap::from_bits_mask(
if (OB_UNLIKELY(valid_bytes_ < to || nullptr == bits)) {
ret = OB_INVALID_ARGUMENT;
LIB_LOG(WARN, "Invalid argument", K_(valid_bytes), K(to), KP(bits));
} else {
const uint64_t size = to - from;
#if defined(__BMI2__) && defined(__AVX512BW__) && defined(__AVX512F__)
const uint8_t *pos = bits;
const uint8_t *end_pos64 = pos + size / 64 * 8;
const uint8_t *end_pos8 = pos + size / 8;
uint8_t *out = data_ + from;
for (; pos < end_pos64; pos += 8) {
const int64_t *bit_mask = reinterpret_cast<const int64_t *>(pos);
__m512i zeros = _mm512_set1_epi64(0);
__m512i ones = _mm512_set1_epi64(0x0101010101010101ULL);
__m512i z = _mm512_mask_blend_epi8(*bit_mask, zeros, ones);
_mm512_storeu_si512((__m512i*)out, z);
out += 64;
}
for (; pos < end_pos8; ++pos) {
uint64_t *out64 = reinterpret_cast<uint64_t *>(out);
*out64 = static_cast<uint64_t>(_pdep_u64(static_cast<uint64_t>(*pos), 0x0101010101010101ULL));
out += 8;
}
const int64_t remain_size = (to - from) % 8;
for (int64_t idx = 0; idx < remain_size; ++idx) {
*(out++) = is_bit_set(pos, idx);
}
#elif defined(__BMI2__)
const uint8_t *pos = bits;
const uint8_t *end_pos8 = pos + size / 8;
uint8_t *out = data_ + from;
for (; pos < end_pos8; ++pos) {
uint64_t *out64 = reinterpret_cast<uint64_t *>(out);
*out64 = static_cast<uint64_t>(_pdep_u64(static_cast<uint64_t>(*pos), 0x0101010101010101ULL));
out += 8;
}
const int64_t remain_size = (to - from) % 8;
for (int64_t idx = 0; idx < remain_size; ++idx) {
*(out++) = is_bit_set(pos, idx);
}
#else
// TODO(hanling): optimize the case where SIMD and BMI2 instructions are not supported.
uint8_t *out = data_ + from;
uint64_t *bits64 = reinterpret_cast<uint64_t *>(bits);
for (uint64_t i = 0; i < size; ++i) {
if (bits64[i / 64] & (1LU << (i % 64))) {
*out = 1;
}
++out;
}
#if OB_USE_MULTITARGET_CODE
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
common::specific::avx2::inner_from_bits_mask(from, to, bits, data_);
#endif
} else {
common::specific::normal::inner_from_bits_mask(from, to, bits, data_);
}
return ret;
}
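
For reference, the deleted fast path relied on BMI2's _pdep_u64, which deposits the low-order bits of its first argument at the set-bit positions of its mask; with mask 0x0101010101010101 each source bit lands in the lowest bit of its own output byte, expanding 8 bits to 8 bytes per call. A standalone scalar model of that instruction (not part of this diff):

#include <cstdint>
#include <cstdio>

// Scalar model of _pdep_u64: the k-th set bit of mask receives bit k of src.
static uint64_t pdep_model(uint64_t src, uint64_t mask)
{
  uint64_t res = 0;
  for (uint64_t bit = 1; mask != 0; bit <<= 1) {
    uint64_t lowest = mask & (~mask + 1);  // lowest set bit of mask
    if (src & bit) {
      res |= lowest;
    }
    mask &= mask - 1;                      // clear that bit and continue
  }
  return res;
}

int main()
{
  // Prints 0100010100000100: bytes 1, 4, 5 and 7 are 1, i.e. the set bits of 0xb2.
  printf("%016llx\n", (unsigned long long)pdep_model(0xb2, 0x0101010101010101ULL));
  return 0;
}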
@@ -747,6 +799,8 @@ void ObBitmap::filter(
{
if (!has_null) {
#if OB_USE_MULTITARGET_CODE
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
SelectOpImpl<SelectOrOp>::apply_op_avx2(skip, nulls, (size + 7) / 8);
} else if (common::is_arch_supported(ObTargetArch::SSE42)) {
SelectOpImpl<SelectOrOp>::apply_op_sse42(skip, nulls, (size + 7) / 8);
#endif
@@ -757,6 +811,8 @@ void ObBitmap::filter(
#if OB_USE_MULTITARGET_CODE
if (common::is_arch_supported(ObTargetArch::AVX512)) {
common::specific::avx512::uint64_mask_to_bits_mask(data, size, skip);
} else if (common::is_arch_supported(ObTargetArch::AVX2)) {
common::specific::avx2::uint64_mask_to_bits_mask(data, size, skip);
} else {
#endif
common::specific::normal::uint64_mask_to_bits_mask(data, size, skip);

View File

@@ -58,6 +58,7 @@ OB_INLINE uint64_t countl_zero64(uint64_t mask)
return __builtin_clzll(mask);
}
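One caveat on the context above: __builtin_clzll is undefined for a zero argument, so callers of countl_zero64 presumably guarantee mask != 0. A zero-safe variant (hypothetical, not part of this commit) would pin that case explicitly:

OB_INLINE uint64_t countl_zero64_safe(uint64_t mask)
{
  // __builtin_clzll(0) is undefined behavior; define the zero case as 64.
  return 0 == mask ? 64 : __builtin_clzll(mask);
}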
// TODO: use templates to avoid branch mispredictions in the SIMD dispatch
class ObBitmap
{
public: