Files
doris/be/test/util/bitmap_value_test.cpp
Dayue Gao 3b24287251 Support 64 bits integers for BITMAP type (#2772)
Fixes #2771 

Main changes in this CL
* RoaringBitmap is renamed to BitmapValue and moved into bitmap_value.h
* leveraging Roaring64Map to support unsigned BIGINT for BITMAP type
* introduces two new format (SINGLE64 and BITMAP64) for BITMAP type

So far we have three storage format for BITMAP type

```
EMPTY := TypeCode(0x00)
SINGLE32 := TypeCode(0x01), UInt32LittleEndian
BITMAP32 := TypeCode(0x02), RoaringBitmap(defined by https://github.com/RoaringBitmap/RoaringFormatSpec/)
```

In order to support BIGINT element and keep backward compatibility, introduce two new format

```
SINGLE64 := TypeCode(0x03), UInt64LittleEndian
BITMAP64 := TypeCode(0x04), CustomRoaringBitmap64
```

Please note that SINGLE64/BITMAP64 doesn't replace SINGLE32/BITMAP32. Doris will choose the smaller (in terms of space) type automatically during serializing. For example, BITMAP32 is preferred over BITMAP64 when the maximum element is <= UINT32_MAX. This will also make BE rollback possible as long as user didn't write element larger than UINT32_MAX into bitmap column.

Another important design decision is that we fork and maintain our own version of Roaring64Map instead of using the one in "roaring/roaring64map.hh". The reasons are

1. RoaringBitmap doesn't define a standard for the binary format of 64-bits bitmap. As a result, different implementations of Roaring64Map use different format. For example the [C++ version](https://github.com/RoaringBitmap/CRoaring/blob/v0.2.60/cpp/roaring64map.hh#L545) is different from the [Java version](35104c564e/src/main/java/org/roaringbitmap/longlong/Roaring64NavigableMap.java (L1097)). Even for CRoaring, the format may change in future releases. However Doris require the serialized format to be stable across versions. Fork is a safe way to achieve this.
2. We may want to make some code changes to Roaring64Map according to our needs. For example, in order to use the BITMAP32 format when the maximum element can be represented in 32 bits, we may want to access the private member of Roaring64Map. Another example is we want to further customize and optimize the format for BITMAP64 case, such as using vint64 instead of uint64 for map size.
2020-01-17 14:13:38 +08:00

312 lines
9.3 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/bitmap_value.h"
#include <cstdint>
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "util/coding.h"
namespace doris {
TEST(BitmapValueTest, bitmap_union) {
BitmapValue empty;
BitmapValue single(1024);
BitmapValue bitmap({1024, 1025, 1026});
ASSERT_EQ(0, empty.cardinality());
ASSERT_EQ(1, single.cardinality());
ASSERT_EQ(3, bitmap.cardinality());
BitmapValue empty2;
empty2 |= empty;
ASSERT_EQ(0, empty2.cardinality());
empty2 |= single;
ASSERT_EQ(1, empty2.cardinality());
BitmapValue empty3;
empty3 |= bitmap;
ASSERT_EQ(3, empty3.cardinality());
BitmapValue single2(1025);
single2 |= empty;
ASSERT_EQ(1, single2.cardinality());
single2 |= single;
ASSERT_EQ(2, single2.cardinality());
BitmapValue single3(1027);
single3 |= bitmap;
ASSERT_EQ(4, single3.cardinality());
BitmapValue bitmap2;
bitmap2.add(1024);
bitmap2.add(2048);
bitmap2.add(4096);
bitmap2 |= empty;
ASSERT_EQ(3, bitmap2.cardinality());
bitmap2 |= single;
ASSERT_EQ(3, bitmap2.cardinality());
bitmap2 |= bitmap;
ASSERT_EQ(5, bitmap2.cardinality());
}
TEST(BitmapValueTest, bitmap_intersect) {
BitmapValue empty;
BitmapValue single(1024);
BitmapValue bitmap({1024, 1025, 1026});
BitmapValue empty2;
empty2 &= empty;
ASSERT_EQ(0, empty2.cardinality());
empty2 &= single;
ASSERT_EQ(0, empty2.cardinality());
empty2 &= bitmap;
ASSERT_EQ(0, empty2.cardinality());
BitmapValue single2(1025);
single2 &= empty;
ASSERT_EQ(0, single2.cardinality());
BitmapValue single4(1025);
single4 &= single;
ASSERT_EQ(0, single4.cardinality());
BitmapValue single3(1024);
single3 &= single;
ASSERT_EQ(1, single3.cardinality());
single3 &= bitmap;
ASSERT_EQ(1, single3.cardinality());
BitmapValue single5(2048);
single5 &= bitmap;
ASSERT_EQ(0, single5.cardinality());
BitmapValue bitmap2;
bitmap2.add(1024);
bitmap2.add(2048);
bitmap2 &= empty;
ASSERT_EQ(0, bitmap2.cardinality());
BitmapValue bitmap3;
bitmap3.add(1024);
bitmap3.add(2048);
bitmap3 &= single;
ASSERT_EQ(1, bitmap3.cardinality());
BitmapValue bitmap4;
bitmap4.add(2049);
bitmap4.add(2048);
bitmap4 &= single;
ASSERT_EQ(0, bitmap4.cardinality());
BitmapValue bitmap5;
bitmap5.add(2049);
bitmap5.add(2048);
bitmap5 &= bitmap;
ASSERT_EQ(0, bitmap5.cardinality());
BitmapValue bitmap6;
bitmap6.add(1024);
bitmap6.add(1025);
bitmap6 &= bitmap;
ASSERT_EQ(2, bitmap6.cardinality());
}
std::string convert_bitmap_to_string(BitmapValue& bitmap) {
std::string buf;
buf.resize(bitmap.getSizeInBytes());
bitmap.write((char*)buf.c_str());
return buf;
}
TEST(BitmapValueTest, bitmap_serde) {
{ // EMPTY
BitmapValue empty;
std::string buffer = convert_bitmap_to_string(empty);
std::string expect_buffer(1, BitmapTypeCode::EMPTY);
ASSERT_EQ(expect_buffer, buffer);
BitmapValue out(buffer.data());
ASSERT_EQ(0, out.cardinality());
}
{ // SINGLE32
uint32_t i = UINT32_MAX;
BitmapValue single32(i);
std::string buffer = convert_bitmap_to_string(single32);
std::string expect_buffer(1, BitmapTypeCode::SINGLE32);
put_fixed32_le(&expect_buffer, i);
ASSERT_EQ(expect_buffer, buffer);
BitmapValue out(buffer.data());
ASSERT_EQ(1, out.cardinality());
ASSERT_TRUE(out.contains(i));
}
{ // BITMAP32
BitmapValue bitmap32({ 0, UINT32_MAX });
std::string buffer = convert_bitmap_to_string(bitmap32);
Roaring roaring;
roaring.add(0);
roaring.add(UINT32_MAX);
std::string expect_buffer(1, BitmapTypeCode::BITMAP32);
expect_buffer.resize(1 + roaring.getSizeInBytes());
roaring.write(&expect_buffer[1]);
ASSERT_EQ(expect_buffer, buffer);
BitmapValue out(buffer.data());
ASSERT_EQ(2, out.cardinality());
ASSERT_TRUE(out.contains(0));
ASSERT_TRUE(out.contains(UINT32_MAX));
}
{ // SINGLE64
uint64_t i = static_cast<uint64_t>(UINT32_MAX) + 1;
BitmapValue single64(i);
std::string buffer = convert_bitmap_to_string(single64);
std::string expect_buffer(1, BitmapTypeCode::SINGLE64);
put_fixed64_le(&expect_buffer, i);
ASSERT_EQ(expect_buffer, buffer);
BitmapValue out(buffer.data());
ASSERT_EQ(1, out.cardinality());
ASSERT_TRUE(out.contains(i));
}
{ // BITMAP64
BitmapValue bitmap64({ 0, static_cast<uint64_t>(UINT32_MAX) + 1 });
std::string buffer = convert_bitmap_to_string(bitmap64);
Roaring roaring;
roaring.add(0);
std::string expect_buffer(1, BitmapTypeCode::BITMAP64);
put_varint64(&expect_buffer, 2); // map size
for (uint32_t i = 0; i < 2; ++i) {
std::string map_entry;
put_fixed32_le(&map_entry, i); // map key
map_entry.resize(sizeof(uint32_t) + roaring.getSizeInBytes());
roaring.write(&map_entry[4]); // map value
expect_buffer.append(map_entry);
}
ASSERT_EQ(expect_buffer, buffer);
BitmapValue out(buffer.data());
ASSERT_EQ(2, out.cardinality());
ASSERT_TRUE(out.contains(0));
ASSERT_TRUE(out.contains(static_cast<uint64_t>(UINT32_MAX) + 1));
}
}
// Forked from CRoaring's UT of Roaring64Map
TEST(BitmapValueTest, Roaring64Map) {
using doris::detail::Roaring64Map;
// create a new empty bitmap
Roaring64Map r1;
uint64_t r1_sum = 0;
// then we can add values
for (uint64_t i = 100; i < 1000; i++) {
r1.add(i);
r1_sum += i;
}
for (uint64_t i = 14000000000000000100ull; i < 14000000000000001000ull; i++) {
r1.add(i);
r1_sum += i;
}
ASSERT_TRUE(r1.contains((uint64_t)14000000000000000500ull));
ASSERT_EQ(1800, r1.cardinality());
size_t size_before = r1.getSizeInBytes();
r1.runOptimize();
size_t size_after = r1.getSizeInBytes();
ASSERT_LT(size_after, size_before);
Roaring64Map r2 = Roaring64Map::bitmapOf(5, 1ull, 2ull, 234294967296ull,
195839473298ull, 14000000000000000100ull);
ASSERT_EQ(1ull, r2.minimum());
ASSERT_EQ(14000000000000000100ull, r2.maximum());
ASSERT_EQ(4ull, r2.rank(234294967296ull));
// we can also create a bitmap from a pointer to 32-bit integers
const uint32_t values[] = {2, 3, 4};
Roaring64Map r3(3, values);
ASSERT_EQ(3, r3.cardinality());
// we can also go in reverse and go from arrays to bitmaps
uint64_t card1 = r1.cardinality();
uint64_t* arr1 = new uint64_t[card1];
ASSERT_TRUE(arr1 != nullptr);
r1.toUint64Array(arr1);
Roaring64Map r1f(card1, arr1);
delete[] arr1;
// bitmaps shall be equal
ASSERT_TRUE(r1 == r1f);
// we can copy and compare bitmaps
Roaring64Map z(r3);
ASSERT_TRUE(r3 == z);
// we can compute union two-by-two
Roaring64Map r1_2_3 = r1 | r2;
r1_2_3 |= r3;
// we can compute a big union
const Roaring64Map *allmybitmaps[] = {&r1, &r2, &r3};
Roaring64Map bigunion = Roaring64Map::fastunion(3, allmybitmaps);
ASSERT_TRUE(r1_2_3 == bigunion);
ASSERT_EQ(1806, r1_2_3.cardinality());
// we can compute intersection two-by-two
Roaring64Map i1_2 = r1 & r2;
ASSERT_EQ(1, i1_2.cardinality());
// we can write a bitmap to a pointer and recover it later
uint32_t expectedsize = r1.getSizeInBytes();
char* serializedbytes = new char[expectedsize];
r1.write(serializedbytes);
Roaring64Map t = Roaring64Map::read(serializedbytes);
ASSERT_TRUE(r1 == t);
delete[] serializedbytes;
// we can iterate over all values using custom functions
uint64_t sum = 0;
auto func = [](uint64_t value, void* param) {
*(uint64_t*)param += value;
return true; // we always process all values
};
r1.iterate(func, &sum);
ASSERT_EQ(r1_sum, sum);
// we can also iterate the C++ way
sum = 0;
for (Roaring64Map::const_iterator i = t.begin() ; i != t.end() ; i++) {
sum += *i;
}
ASSERT_EQ(r1_sum, sum);
}
TEST(BitmapValueTest, bitmap_to_string) {
BitmapValue empty;
ASSERT_STREQ("", empty.to_string().c_str());
empty.add(1);
ASSERT_STREQ("1", empty.to_string().c_str());
empty.add(2);
ASSERT_STREQ("1,2", empty.to_string().c_str());
}
} // namespace doris
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}