Fixes #2771
Main changes in this CL
* RoaringBitmap is renamed to BitmapValue and moved into bitmap_value.h
* leveraging Roaring64Map to support unsigned BIGINT for BITMAP type
* introduces two new formats (SINGLE64 and BITMAP64) for BITMAP type
So far we have three storage formats for BITMAP type
```
EMPTY := TypeCode(0x00)
SINGLE32 := TypeCode(0x01), UInt32LittleEndian
BITMAP32 := TypeCode(0x02), RoaringBitmap(defined by https://github.com/RoaringBitmap/RoaringFormatSpec/)
```
In order to support BIGINT elements and keep backward compatibility, we introduce two new formats
```
SINGLE64 := TypeCode(0x03), UInt64LittleEndian
BITMAP64 := TypeCode(0x04), CustomRoaringBitmap64
```
Please note that SINGLE64/BITMAP64 don't replace SINGLE32/BITMAP32. Doris will automatically choose the smaller (in terms of space) format during serialization. For example, BITMAP32 is preferred over BITMAP64 when the maximum element is <= UINT32_MAX. This also makes BE rollback possible as long as the user hasn't written any element larger than UINT32_MAX into a bitmap column.
Another important design decision is that we fork and maintain our own version of Roaring64Map instead of using the one in "roaring/roaring64map.hh". The reasons are
1. RoaringBitmap doesn't define a standard for the binary format of 64-bit bitmaps. As a result, different implementations of Roaring64Map use different formats. For example, the [C++ version](https://github.com/RoaringBitmap/CRoaring/blob/v0.2.60/cpp/roaring64map.hh#L545) is different from the [Java version](https://github.com/RoaringBitmap/RoaringBitmap/blob/35104c564e/src/main/java/org/roaringbitmap/longlong/Roaring64NavigableMap.java#L1097). Even for CRoaring, the format may change in future releases. However, Doris requires the serialized format to be stable across versions. Forking is a safe way to achieve this.
2. We may want to make some code changes to Roaring64Map according to our needs. For example, in order to use the BITMAP32 format when the maximum element can be represented in 32 bits, we may want to access the private member of Roaring64Map. Another example is we want to further customize and optimize the format for BITMAP64 case, such as using vint64 instead of uint64 for map size.
155 lines
4.3 KiB
C++
155 lines
4.3 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "util/bitmap.h"
|
|
|
|
#include <cstring>
#include <sstream>
|
|
|
|
#include "gutil/stringprintf.h"
|
|
|
|
namespace doris {
|
|
|
|
std::string Bitmap::DebugString(bool print_bits) const {
|
|
int64_t words = BitUtil::round_up(num_bits_, 64) / 64;
|
|
std::stringstream ss;
|
|
ss << "Size (" << num_bits_ << ") words (" << words << ") ";
|
|
if (print_bits) {
|
|
for (int i = 0; i < num_bits(); ++i) {
|
|
if (Get(i)) {
|
|
ss << "1";
|
|
} else {
|
|
ss << "0";
|
|
}
|
|
}
|
|
} else {
|
|
for (auto v : buffer_) {
|
|
ss << v << ".";
|
|
}
|
|
}
|
|
ss << std::endl;
|
|
return ss.str();
|
|
}
|
|
|
|
// Sets (value == true) or clears (value == false) the num_bits consecutive
// bits of `bitmap` that start at bit index `offset`.
//
// Bit i is stored in byte i / 8 at bit position i % 8, counted from the least
// significant bit. num_bits must be greater than zero (checked with DCHECK).
//
// The range is processed in up to three pieces: the partial first byte, a run
// of whole bytes filled with memset, and the partial (or full) last byte.
void BitmapChangeBits(uint8_t *bitmap, size_t offset, size_t num_bits, bool value) {
    DCHECK_GT(num_bits, 0);

    const size_t first_byte = offset / 8;
    const size_t last_byte = (offset + num_bits - 1) / 8;
    const bool fits_in_one_byte = (first_byte == last_byte);

    // Head: bits [lo, hi) of the first byte. When the whole range lives in a
    // single byte, hi stops at the end of the range instead of the byte end.
    const size_t lo = offset % 8;
    const size_t hi = fits_in_one_byte ? (lo + num_bits) : 8;
    const uint8_t head_mask = ((0xff << lo) & (0xff >> (8 - hi)));
    if (value) {
        bitmap[first_byte] |= head_mask;
    } else {
        bitmap[first_byte] &= ~head_mask;
    }

    if (fits_in_one_byte) {
        return;
    }

    // Middle: every byte strictly between the first and the last one is
    // covered entirely, so it can be overwritten wholesale.
    const size_t middle_begin = first_byte + 1;
    if (last_byte > middle_begin) {
        const int fill = value ? 0xff : 0x00;
        memset(bitmap + middle_begin, fill, last_byte - middle_begin);
    }

    // Tail: the lowest tail_bits bits of the last byte.
    const size_t tail_bits = offset + num_bits - (last_byte * 8);
    const uint8_t tail_mask = (0xff >> (8 - tail_bits));
    if (value) {
        bitmap[last_byte] |= tail_mask;
    } else {
        bitmap[last_byte] &= ~tail_mask;
    }
}
|
|
|
|
bool BitmapFindFirst(const uint8_t *bitmap, size_t offset, size_t bitmap_size,
|
|
bool value, size_t *idx) {
|
|
const uint64_t pattern64[2] = { 0xffffffffffffffff, 0x0000000000000000 };
|
|
const uint8_t pattern8[2] = { 0xff, 0x00 };
|
|
size_t bit;
|
|
|
|
DCHECK_LE(offset, bitmap_size);
|
|
|
|
// Jump to the byte at specified offset
|
|
const uint8_t *p = bitmap + (offset >> 3);
|
|
size_t num_bits = bitmap_size - offset;
|
|
|
|
// Find a 'value' bit at the end of the first byte
|
|
if ((bit = offset & 0x7)) {
|
|
for (; bit < 8 && num_bits > 0; ++bit) {
|
|
if (BitmapTest(p, bit) == value) {
|
|
*idx = ((p - bitmap) << 3) + bit;
|
|
return true;
|
|
}
|
|
|
|
num_bits--;
|
|
}
|
|
|
|
p++;
|
|
}
|
|
|
|
// check 64bit at the time for a 'value' bit
|
|
const uint64_t *u64 = (const uint64_t *)p;
|
|
while (num_bits >= 64 && *u64 == pattern64[value]) {
|
|
num_bits -= 64;
|
|
u64++;
|
|
}
|
|
|
|
// check 8bit at the time for a 'value' bit
|
|
p = (const uint8_t *)u64;
|
|
while (num_bits >= 8 && *p == pattern8[value]) {
|
|
num_bits -= 8;
|
|
p++;
|
|
}
|
|
|
|
// Find a 'value' bit at the beginning of the last byte
|
|
for (bit = 0; num_bits > 0; ++bit) {
|
|
if (BitmapTest(p, bit) == value) {
|
|
*idx = ((p - bitmap) << 3) + bit;
|
|
return true;
|
|
}
|
|
num_bits--;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
std::string BitmapToString(const uint8_t *bitmap, size_t num_bits) {
|
|
std::string s;
|
|
size_t index = 0;
|
|
while (index < num_bits) {
|
|
StringAppendF(&s, "%4zu: ", index);
|
|
for (int i = 0; i < 8 && index < num_bits; ++i) {
|
|
for (int j = 0; j < 8 && index < num_bits; ++j) {
|
|
StringAppendF(&s, "%d", BitmapTest(bitmap, index));
|
|
index++;
|
|
}
|
|
StringAppendF(&s, " ");
|
|
}
|
|
StringAppendF(&s, "\n");
|
|
}
|
|
return s;
|
|
}
|
|
|
|
}
|