[improvement](bitmap) support version for ser/deser of bitmap (#23959)

This commit is contained in:
TengJianPing
2023-09-07 09:55:29 +08:00
committed by GitHub
parent a532a08944
commit 2f8b075b71
5 changed files with 186 additions and 78 deletions

View File

@ -1082,6 +1082,8 @@ DEFINE_Int32(fe_expire_duration_seconds, "60");
DEFINE_Int32(grace_shutdown_wait_seconds, "120");
DEFINE_Int16(bitmap_serialize_version, "1");
#ifdef BE_TEST
// test s3
DEFINE_String(test_s3_resource, "resource");

View File

@ -1150,6 +1150,9 @@ DECLARE_Int32(fe_expire_duration_seconds);
// During this period, FE will not send any queries to BE and will wait for all running queries to stop.
DECLARE_Int32(grace_shutdown_wait_seconds);
// BitmapValue serialize version.
DECLARE_Int16(bitmap_serialize_version);
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);

View File

@ -76,10 +76,14 @@ struct BitmapTypeCode {
//
// added in 0.12
BITMAP64 = 4,
SET = 5
SET = 5, // V1
SET_V2 = 10,
BITMAP32_V2 = 12,
BITMAP64_V2 = 13,
TYPE_MAX
};
Status static inline validate(int bitmap_type) {
if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type > type::SET)) {
if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type >= type::TYPE_MAX)) {
std::string err_msg =
fmt::format("BitmapTypeCode invalid, should between: {} and {} actrual is {}",
BitmapTypeCode::EMPTY, BitmapTypeCode::BITMAP64, bitmap_type);
@ -699,29 +703,35 @@ public:
* write a bitmap to a char buffer.
* Returns how many bytes were written which should be getSizeInBytes().
*/
size_t write(char* buf) const {
size_t write(char* buf, int serialize_version) const {
bool is_v1 = serialize_version == 1;
BitmapTypeCode::type type_bitmap32 =
is_v1 ? BitmapTypeCode::type::BITMAP32 : BitmapTypeCode::type::BITMAP32_V2;
BitmapTypeCode::type type_bitmap64 =
is_v1 ? BitmapTypeCode::type::BITMAP64 : BitmapTypeCode::type::BITMAP64_V2;
if (is32BitsEnough()) {
*(buf++) = BitmapTypeCode::type::BITMAP32;
*(buf++) = type_bitmap32;
auto it = roarings.find(0);
if (it == roarings.end()) { // empty bitmap
roaring::Roaring r;
return r.write(buf) + 1;
return r.write(buf, is_v1) + 1;
}
return it->second.write(buf) + 1;
return it->second.write(buf, is_v1) + 1;
}
const char* orig = buf;
// put type code
*(buf++) = BitmapTypeCode::type::BITMAP64;
*(buf++) = type_bitmap64;
// push map size
buf = (char*)encode_varint64((uint8_t*)buf, roarings.size());
std::for_each(roarings.cbegin(), roarings.cend(),
[&buf](const std::pair<const uint32_t, roaring::Roaring>& map_entry) {
[&buf, is_v1](const std::pair<const uint32_t, roaring::Roaring>& map_entry) {
// push map key
encode_fixed32_le((uint8_t*)buf, map_entry.first);
buf += sizeof(uint32_t);
// push map value Roaring
buf += map_entry.second.write(buf);
buf += map_entry.second.write(buf, is_v1);
});
return buf - orig;
}
@ -735,13 +745,16 @@ public:
static Roaring64Map read(const char* buf) {
Roaring64Map result;
if (*buf == BitmapTypeCode::BITMAP32) {
roaring::Roaring read = roaring::Roaring::read(buf + 1);
bool is_v1 = BitmapTypeCode::BITMAP32 == *buf || BitmapTypeCode::BITMAP64 == *buf;
bool is_bitmap32 = BitmapTypeCode::BITMAP32 == *buf || BitmapTypeCode::BITMAP32_V2 == *buf;
bool is_bitmap64 = BitmapTypeCode::BITMAP64 == *buf || BitmapTypeCode::BITMAP64_V2 == *buf;
if (is_bitmap32) {
roaring::Roaring read = roaring::Roaring::read(buf + 1, is_v1);
result.emplaceOrInsert(0, std::move(read));
return result;
}
DCHECK_EQ(BitmapTypeCode::BITMAP64, *buf);
DCHECK(is_bitmap64);
buf++;
// get map size (varint64 took 1~10 bytes)
@ -755,9 +768,9 @@ public:
uint32_t key = decode_fixed32_le(reinterpret_cast<const uint8_t*>(buf));
buf += sizeof(uint32_t);
// read map value Roaring
roaring::Roaring read_var = roaring::Roaring::read(buf);
roaring::Roaring read_var = roaring::Roaring::read(buf, is_v1);
// forward buffer past the last Roaring Bitmap
buf += read_var.getSizeInBytes();
buf += read_var.getSizeInBytes(is_v1);
result.emplaceOrInsert(key, std::move(read_var));
}
return result;
@ -766,14 +779,15 @@ public:
/**
* How many bytes are required to serialize this bitmap
*/
size_t getSizeInBytes() const {
size_t getSizeInBytes(int serialize_version) const {
bool is_v1 = serialize_version == 1;
if (is32BitsEnough()) {
auto it = roarings.find(0);
if (it == roarings.end()) { // empty bitmap
roaring::Roaring r;
return r.getSizeInBytes() + 1;
return r.getSizeInBytes(is_v1) + 1;
}
return it->second.getSizeInBytes() + 1;
return it->second.getSizeInBytes(is_v1) + 1;
}
// start with type code, map size and size of keys for each map entry
size_t init = 1 + varint_length(roarings.size()) + roarings.size() * sizeof(uint32_t);
@ -781,7 +795,7 @@ public:
roarings.cbegin(), roarings.cend(), init,
[=](size_t previous, const std::pair<const uint32_t, roaring::Roaring>& map_entry) {
// add in bytes used by each Roaring
return previous + map_entry.second.getSizeInBytes();
return previous + map_entry.second.getSizeInBytes(is_v1);
});
}
@ -1314,10 +1328,11 @@ public:
case SET:
return BitmapTypeCode::SET;
case BITMAP:
bool is_v1 = (config::bitmap_serialize_version == 1);
if (_bitmap->is32BitsEnough()) {
return BitmapTypeCode::BITMAP32;
return is_v1 ? BitmapTypeCode::type::BITMAP32 : BitmapTypeCode::type::BITMAP32_V2;
} else {
return BitmapTypeCode::BITMAP64;
return is_v1 ? BitmapTypeCode::type::BITMAP64 : BitmapTypeCode::type::BITMAP64_V2;
}
}
}
@ -2167,7 +2182,7 @@ public:
case BITMAP:
_bitmap->runOptimize();
_bitmap->shrinkToFit();
res = _bitmap->getSizeInBytes();
res = _bitmap->getSizeInBytes(config::bitmap_serialize_version);
break;
case SET:
/// 1 byte for type, 1 byte for count
@ -2205,7 +2220,7 @@ public:
}
break;
case BITMAP:
_bitmap->write(dst);
_bitmap->write(dst, config::bitmap_serialize_version);
break;
}
}
@ -2235,6 +2250,8 @@ public:
break;
case BitmapTypeCode::BITMAP32:
case BitmapTypeCode::BITMAP64:
case BitmapTypeCode::BITMAP32_V2:
case BitmapTypeCode::BITMAP64_V2:
_type = BITMAP;
_prepare_bitmap_for_write();
*_bitmap = detail::Roaring64Map::read(src);
@ -2260,6 +2277,34 @@ public:
}
break;
}
case BitmapTypeCode::SET_V2: {
    // Layout read here: 1 type byte, then a uint32_t element count, then
    // `size` consecutive uint64_t keys. All values are copied with memcpy,
    // i.e. taken in host byte order and without alignment requirements.
    uint32_t size = 0;
    memcpy(&size, src + 1, sizeof(uint32_t));
    src += sizeof(uint32_t) + 1;
    if (!config::enable_set_in_bitmap_value || size > SET_TYPE_THRESHOLD) {
        // Sets are disabled by config, or the count exceeds SET_TYPE_THRESHOLD:
        // load the keys into the roaring bitmap representation instead.
        _type = BITMAP;
        _prepare_bitmap_for_write();
        // The counter is unsigned to match `size`: this branch places no upper
        // bound on the count, and a signed `int` counter would overflow
        // (undefined behaviour) for counts above INT_MAX.
        for (uint32_t i = 0; i < size; ++i) {
            uint64_t key {};
            memcpy(&key, src, sizeof(uint64_t));
            _bitmap->add(key);
            src += sizeof(uint64_t);
        }
    } else {
        _type = SET;
        _set.reserve(size);
        for (uint32_t i = 0; i < size; ++i) {
            uint64_t key {};
            memcpy(&key, src, sizeof(uint64_t));
            _set.insert(key);
            src += sizeof(uint64_t);
        }
    }
    break;
}
default:
LOG(ERROR) << "BitmapTypeCode invalid, should between: " << BitmapTypeCode::EMPTY
<< " and " << BitmapTypeCode::BITMAP64 << " actual is "

View File

@ -266,9 +266,9 @@ TEST(BitmapValueTest, Roaring64Map) {
}
EXPECT_TRUE(r1.contains((uint64_t)14000000000000000500ull));
EXPECT_EQ(1800, r1.cardinality());
size_t size_before = r1.getSizeInBytes();
size_t size_before = r1.getSizeInBytes(1);
r1.runOptimize();
size_t size_after = r1.getSizeInBytes();
size_t size_after = r1.getSizeInBytes(1);
EXPECT_LT(size_after, size_before);
Roaring64Map r2 = Roaring64Map::bitmapOf(5, 1ull, 2ull, 234294967296ull, 195839473298ull,
@ -311,9 +311,9 @@ TEST(BitmapValueTest, Roaring64Map) {
EXPECT_EQ(1, i1_2.cardinality());
// we can write a bitmap to a pointer and recover it later
uint32_t expectedsize = r1.getSizeInBytes();
uint32_t expectedsize = r1.getSizeInBytes(1);
char* serializedbytes = new char[expectedsize];
r1.write(serializedbytes);
r1.write(serializedbytes, 1);
Roaring64Map t = Roaring64Map::read(serializedbytes);
EXPECT_TRUE(r1 == t);
delete[] serializedbytes;

View File

@ -93,6 +93,11 @@ TEST(function_bitmap_test, function_bitmap_to_base64) {
config::Register::_s_field_map->insert(
std::make_pair(std::string("enable_set_in_bitmap_value"), field));
config::Register::Field field_ser_ver("int16_t", "bitmap_serialize_version",
&config::bitmap_serialize_version, "1", false);
config::Register::_s_field_map->insert(
std::make_pair(std::string("bitmap_serialize_version"), field_ser_ver));
std::string func_name = "bitmap_to_base64";
InputTypeSet input_types = {TypeIndex::BitMap};
@ -123,19 +128,21 @@ TEST(function_bitmap_test, function_bitmap_to_base64) {
EXPECT_EQ(bitmap64_2.get_type_code(), BitmapTypeCode::BITMAP64);
EXPECT_EQ(bitmap64_3.get_type_code(), BitmapTypeCode::BITMAP64);
DataSet data_set = {
{{&bitmap32_1}, std::string("AQEAAAA=")},
{{&bitmap32_2}, std::string("AjowAAACAAAAAAAAAJgAAAAYAAAAGgAAAAEAf5Y=")},
{{&bitmap32_3}, std::string("AjswAAABAAAgAAEAAAAgAA==")},
{{&bitmap64_1}, std::string("AwAAAAABAAAA")},
{{&bitmap64_2},
std::string("BAIAAAAAOjAAAAEAAAAAAAAAEAAAAAEAAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&bitmap64_3},
std::string("BAIAAAAAOzAAAAEAAB8AAQAAAB8AAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&empty_bitmap}, std::string("AA==")},
{{Null()}, Null()}};
{
DataSet data_set = {
{{&bitmap32_1}, std::string("AQEAAAA=")},
{{&bitmap32_2}, std::string("AjowAAACAAAAAAAAAJgAAAAYAAAAGgAAAAEAf5Y=")},
{{&bitmap32_3}, std::string("AjswAAABAAAgAAEAAAAgAA==")},
{{&bitmap64_1}, std::string("AwAAAAABAAAA")},
{{&bitmap64_2},
std::string("BAIAAAAAOjAAAAEAAAAAAAAAEAAAAAEAAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&bitmap64_3},
std::string("BAIAAAAAOzAAAAEAAB8AAQAAAB8AAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&empty_bitmap}, std::string("AA==")},
{{Null()}, Null()}};
check_function<DataTypeString, true>(func_name, input_types, data_set);
check_function<DataTypeString, true>(func_name, input_types, data_set);
}
EXPECT_TRUE(config::set_config("enable_set_in_bitmap_value", "true", false, true).ok());
bitmap32_1 = BitmapValue(1); // single
@ -154,18 +161,46 @@ TEST(function_bitmap_test, function_bitmap_to_base64) {
EXPECT_EQ(bitmap64_2.get_type_code(), BitmapTypeCode::SET);
EXPECT_EQ(bitmap64_3.get_type_code(), BitmapTypeCode::BITMAP64);
DataSet data_set2 = {
{{&bitmap32_1}, std::string("AQEAAAA=")},
{{&bitmap32_2}, std::string("BQIBAAAAAAAAAH+WmAAAAAAA")},
{{&bitmap32_3}, std::string("AjswAAABAAAgAAEAAAAgAA==")},
{{&bitmap64_1}, std::string("AwAAAAABAAAA")},
{{&bitmap64_2}, std::string("BQIAAAAAAQAAAAEAAAAAAAAA")},
{{&bitmap64_3},
std::string("BAIAAAAAOzAAAAEAAB8AAQAAAB8AAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&empty_bitmap}, std::string("AA==")},
{{Null()}, Null()}};
{
DataSet data_set = {
{{&bitmap32_1}, std::string("AQEAAAA=")},
{{&bitmap32_2}, std::string("BQIBAAAAAAAAAH+WmAAAAAAA")},
{{&bitmap32_3}, std::string("AjswAAABAAAgAAEAAAAgAA==")},
{{&bitmap64_1}, std::string("AwAAAAABAAAA")},
{{&bitmap64_2}, std::string("BQIAAAAAAQAAAAEAAAAAAAAA")},
{{&bitmap64_3},
std::string("BAIAAAAAOzAAAAEAAB8AAQAAAB8AAQAAADowAAABAAAAAAAAABAAAAAAAA==")},
{{&empty_bitmap}, std::string("AA==")},
{{Null()}, Null()}};
check_function<DataTypeString, true>(func_name, input_types, data_set2);
check_function<DataTypeString, true>(func_name, input_types, data_set);
}
{
std::string base64("BQQAAAAAAAAAAAEAAAAAAAAAAgAAAAAAAAADAAAAAAAAAA==");
BitmapValue bitmap;
bitmap.add(0);
bitmap.add(1);
bitmap.add(2);
bitmap.add(3);
DataSet data_set = {{{&bitmap}, base64}};
check_function<DataTypeString, true>(func_name, input_types, data_set);
}
// test bitmap serialize version2
EXPECT_TRUE(config::set_config("bitmap_serialize_version", "2", false, true).ok());
bitmap32_3 = BitmapValue(bits32); // bitmap32
bitmap64_3 = BitmapValue(bits64); // bitmap64
EXPECT_EQ(bitmap32_3.get_type_code(), BitmapTypeCode::BITMAP32_V2);
EXPECT_EQ(bitmap64_3.get_type_code(), BitmapTypeCode::BITMAP64_V2);
{
DataSet data_set = {
{{&bitmap32_3}, std::string("DAI7MAAAAQAAIAABAAAAIAA=")},
{{&bitmap64_3}, std::string("DQIAAAAAAjswAAABAAAfAAEAAAAfAAEAAAABAQAAAAAAAAA=")}};
check_function<DataTypeString, true>(func_name, input_types, data_set);
}
}
TEST(function_bitmap_test, function_bitmap_from_base64) {
@ -174,6 +209,11 @@ TEST(function_bitmap_test, function_bitmap_from_base64) {
config::Register::_s_field_map->insert(
std::make_pair(std::string("enable_set_in_bitmap_value"), field));
config::Register::Field field_ser_ver("int16_t", "bitmap_serialize_version",
&config::bitmap_serialize_version, "1", false);
config::Register::_s_field_map->insert(
std::make_pair(std::string("bitmap_serialize_version"), field_ser_ver));
std::string func_name = "bitmap_from_base64";
InputTypeSet input_types = {TypeIndex::String};
@ -205,47 +245,65 @@ TEST(function_bitmap_test, function_bitmap_from_base64) {
BitmapValue bitmap64_3(bits64); // bitmap
BitmapValue empty_bitmap;
DataSet data_set = {{{bitmap32_base64_1}, bitmap32_1}, {{bitmap32_base64_2}, bitmap32_2},
{{bitmap32_base64_3}, bitmap32_3}, {{bitmap64_base64_1}, bitmap64_1},
{{bitmap64_base64_2}, bitmap64_2}, {{bitmap64_base64_3}, bitmap64_3},
{{base64_empty}, empty_bitmap}, {{Null()}, Null()}};
{
DataSet data_set = {{{bitmap32_base64_1}, bitmap32_1}, {{bitmap32_base64_2}, bitmap32_2},
{{bitmap32_base64_3}, bitmap32_3}, {{bitmap64_base64_1}, bitmap64_1},
{{bitmap64_base64_2}, bitmap64_2}, {{bitmap64_base64_3}, bitmap64_3},
{{base64_empty}, empty_bitmap}, {{Null()}, Null()}};
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
}
EXPECT_TRUE(config::set_config("enable_set_in_bitmap_value", "true", false, true).ok());
bitmap32_base64_1 = ("AQEAAAA=");
bitmap32_base64_2 = ("BQIBAAAAAAAAAH");
bitmap32_base64_2 = ("BQIBAAAAAAAAAH+WmAAAAAAA");
bitmap32_base64_3 = ("AjswAAABAAAgAAEAAAAgAA==");
bitmap64_base64_1 = ("AwAAAAABAAAA");
bitmap64_base64_2 = ("BQIAAAAAAQAAAAEAAAAAAAAA");
bitmap64_base64_3 = ("BAIAAAAAOzAAAAEAAB8AAQAAAB8AAQAAADowAAABAAAAAAAAABAAAAAAAA==");
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
{
DataSet data_set = {{{bitmap32_base64_1}, bitmap32_1}, {{bitmap32_base64_2}, bitmap32_2},
{{bitmap32_base64_3}, bitmap32_3}, {{bitmap64_base64_1}, bitmap64_1},
{{bitmap64_base64_2}, bitmap64_2}, {{bitmap64_base64_3}, bitmap64_3},
{{base64_empty}, empty_bitmap}, {{Null()}, Null()}};
/* sr
mysql [(none)]>select bitmap_to_base64(bitmap_from_string("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32"));
+----------------------------------------------------------------------------------------------------------------------------------+
| bitmap_to_base64(bitmap_from_string('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32')) |
+----------------------------------------------------------------------------------------------------------------------------------+
| AjowAAABAAAAAAAgABAAAAAAAAEAAgADAAQABQAGAAcACAAJAAoACwAMAA0ADgAPABAAEQASABMAFAAVABYAFwAYABkAGgAbABwAHQAeAB8AIAA= |
+----------------------------------------------------------------------------------------------------------------------------------+
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
}
mysql [(none)]>select bitmap_to_base64(bitmap_from_string("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,4294967296"));
+--------------------------------------------------------------------------------------------------------------------------------------------------+
| bitmap_to_base64(bitmap_from_string('0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,4294967296')) |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
| BAIAAAAAOjAAAAEAAAAAAB8AEAAAAAAAAQACAAMABAAFAAYABwAIAAkACgALAAwADQAOAA8AEAARABIAEwAUABUAFgAXABgAGQAaABsAHAAdAB4AHwABAAAAOjAAAAEAAAAAAAAAEAAAAAAA |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
*/
bitmap32_base64_3 =
("AjowAAABAAAAAAAgABAAAAAAAAEAAgADAAQABQAGAAcACAAJAAoACwAMAA0ADgAPABAAEQASABMAFAAVABYAF"
"wAYABkAGgAbABwAHQAeAB8AIAA=");
bitmap64_base64_3 =
("BAIAAAAAOjAAAAEAAAAAAB8AEAAAAAAAAQACAAMABAAFAAYABwAIAAkACgALAAwADQAOAA8AEAARABIAEwAUA"
"BUAFgAXABgAGQAaABsAHAAdAB4AHwABAAAAOjAAAAEAAAAAAAAAEAAAAAAA");
data_set = {{{bitmap32_base64_3}, bitmap32_3}, {{bitmap64_base64_3}, bitmap64_3}};
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
{
std::string base64("CgIAAAAAAAAAAAAAAAEAAAAAAAAA");
BitmapValue bitmap;
bitmap.add(0);
bitmap.add(1);
DataSet data_set = {{{base64}, bitmap}};
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
}
{
EXPECT_TRUE(config::set_config("bitmap_serialize_version", "1", false, true).ok());
std::string base64_32_v1(
"AjowAAABAAAAAAAgABAAAAAAAAEAAgADAAQABQAGAAcACAAJAAoACwAMAA0ADgAPABAAEQASABMAFAAVAB"
"YAFwAYABkAGgAbABwAHQAeAB8AIAA=");
std::string base64_64_v1(
"BAIAAAAAOjAAAAEAAAAAAB8AEAAAAAAAAQACAAMABAAFAAYABwAIAAkACgALAAwADQAOAA8AEAARABIAEw"
"AUABUAFgAXABgAGQAaABsAHAAdAB4AHwABAAAAOjAAAAEAAAAAAAAAEAAAAAAA");
DataSet data_set = {{{base64_32_v1}, bitmap32_3}, {{base64_64_v1}, bitmap64_3}};
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
}
{
EXPECT_TRUE(config::set_config("bitmap_serialize_version", "2", false, true).ok());
std::string base64_32_v2(
"DAI6MAAAAQAAAAAAIAAQAAAAAAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQ"
"AWABcAGAAZABoAGwAcAB0AHgAfACAA");
std::string base64_64_v2(
"DQIAAAAAAjowAAABAAAAAAAfABAAAAAAAAEAAgADAAQABQAGAAcACAAJAAoACwAMAA0ADgAPABAAEQASAB"
"MAFAAVABYAFwAYABkAGgAbABwAHQAeAB8AAQAAAAEBAAAAAAAAAA==");
DataSet data_set = {{{base64_32_v2}, bitmap32_3}, {{base64_64_v2}, bitmap64_3}};
check_function<DataTypeBitMap, true>(func_name, input_types, data_set);
}
}
TEST(function_bitmap_test, function_bitmap_and_count) {