Files
doris/be/test/util/rle_encoding_test.cpp
ZHAO Chun 024348d74b Enable auto convert when check in (#1926)
Leverage gitattributes to enable auto convert end-of-line to LF when
checking in. Convert already exist CRLF to LF by removing all files and
checking out with new .gitattributes file. Except .gitattributes, all
files are only modified at the end of line.
2019-10-09 22:31:27 +08:00

427 lines
12 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ostream>
#include <string>
#include <vector>
#include <limits>
// Must come before gtest.h.
#include <boost/utility/binary.hpp>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "util/bit_stream_utils.h"
#include "util/bit_stream_utils.inline.h"
#include "util/bit_util.h"
#include "util/faststring.h"
#include "util/rle_encoding.h"
#include "util/debug_util.h"
using std::string;
using std::vector;
namespace doris {
// Largest bit width exercised by the width-sweeping tests in this file.
const int kMaxWidth = 64;
// Empty gtest fixture that groups the TEST_F(TestRle, ...) cases in this file.
class TestRle : public testing::Test {};
// Validates encoding of values by encoding and decoding them. If
// expected_encoding != NULL, also validates that the encoded buffer is
// exactly 'expected_encoding'.
// if expected_len is not -1, it will validate the encoded size is correct.
template<typename T>
void ValidateRle(const vector<T>& values, int bit_width,
uint8_t* expected_encoding, int expected_len) {
faststring buffer;
RleEncoder<T> encoder(&buffer, bit_width);
for (const auto& value : values) {
encoder.Put(value);
}
int encoded_len = encoder.Flush();
if (expected_len != -1) {
EXPECT_EQ(encoded_len, expected_len);
}
if (expected_encoding != nullptr) {
EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0)
<< "\n"
<< "Expected: " << hexdump((const char*)expected_encoding, expected_len) << "\n"
<< "Got: " << hexdump((const char*)buffer.data(), buffer.size());
}
// Verify read
RleDecoder<T> decoder(buffer.data(), encoded_len, bit_width);
for (const auto& value : values) {
T val = 0;
bool result = decoder.Get(&val);
EXPECT_TRUE(result);
EXPECT_EQ(value, val);
}
}
// Checks the exact encoded bytes (or at least the encoded size) of two
// hand-picked sequences across every bit width up to kMaxWidth.
TEST(Rle, SpecificSequences) {
    constexpr int kBufferLen = 1024;
    constexpr int kNumValues = 100;
    constexpr int kHalf = kNumValues / 2;
    uint8_t expected_buffer[kBufferLen];

    // Sequence 1: fifty 0's followed by fifty 1's.
    vector<uint64_t> values(kNumValues, 0);
    std::fill(values.begin() + kHalf, values.end(), 1);

    // Expected bytes: for each run, (run length << 1) followed by the value.
    // This layout is only valid while the bit width is at most one byte.
    const uint8_t run_header = kHalf << 1;
    expected_buffer[0] = run_header;
    expected_buffer[1] = 0;
    expected_buffer[2] = run_header;
    expected_buffer[3] = 1;
    for (int width = 1; width <= 8; ++width) {
        ValidateRle(values, width, expected_buffer, 4);
    }
    // For wider values only the size is checked: two runs, each one header
    // byte plus the value rounded up to whole bytes.
    for (int width = 9; width <= kMaxWidth; ++width) {
        const int bytes_per_value = BitUtil::Ceil(width, 8);
        ValidateRle(values, width, nullptr, 2 * (1 + bytes_per_value));
    }

    // Sequence 2: one hundred values alternating 0, 1, 0, 1, ...
    for (int idx = 0; idx < kNumValues; ++idx) {
        values[idx] = idx % 2;
    }
    const int num_groups = BitUtil::Ceil(100, 8);
    constexpr int kFullBytes = 100 / 8;
    expected_buffer[0] = (num_groups << 1) | 1;
    // Twelve full bytes of the bit pattern 1 0 1 0 1 0 1 0.
    std::fill(expected_buffer + 1, expected_buffer + 1 + kFullBytes,
              static_cast<uint8_t>(0xaa));
    // The remaining four values occupy the low bits: 0 0 0 0 1 0 1 0.
    expected_buffer[1 + kFullBytes] = 0x0a;

    // num_groups and expected_buffer are only valid for bit width 1.
    ValidateRle(values, 1, expected_buffer, 1 + num_groups);
    for (int width = 2; width <= kMaxWidth; ++width) {
        ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8));
    }
}
// Runs ValidateRle on 'num_vals' values of width 'bit_width'. When 'value' is
// not -1 every element is that value; otherwise the elements count upwards
// from 0, wrapped to fit into 'bit_width' bits.
void TestRleValues(int bit_width, int num_vals, int value = -1) {
    const bool full_width = (bit_width == 64);
    // Shifting a 64-bit integer by 64 is undefined, so the modulus gets a
    // placeholder for full-width values, where it is never applied.
    const uint64_t mod = full_width ? 1ULL : 1ULL << bit_width;
    const bool use_fixed_value = (value != -1);

    vector<uint64_t> values;
    for (uint64_t counter = 0; counter < static_cast<uint64_t>(num_vals); ++counter) {
        uint64_t next;
        if (use_fixed_value) {
            next = static_cast<uint64_t>(value);
        } else if (full_width) {
            next = counter;
        } else {
            next = counter % mod;
        }
        values.push_back(next);
    }
    ValidateRle(values, bit_width, nullptr, -1);
}
// For every bit width, round-trips a single value, 1024 counting values, and
// 1024 copies each of the constants 0 and 1.
TEST(Rle, TestValues) {
    constexpr int kManyValues = 1024;
    int width = 1;
    while (width <= kMaxWidth) {
        TestRleValues(width, 1);
        TestRleValues(width, kManyValues);
        TestRleValues(width, kManyValues, 0);
        TestRleValues(width, kManyValues, 1);
        ++width;
    }
}
class BitRle : public testing::Test {
public:
BitRle() {
}
virtual ~BitRle() {
}
};
// Encodes 1024 identical booleans (all false, then all true) and expects the
// encoded size to be 3 bytes in both cases.
TEST_F(BitRle, AllSame) {
    constexpr int kTestLen = 1024;
    for (const bool bit : {false, true}) {
        const vector<bool> values(kTestLen, bit);
        ValidateRle(values, 1, nullptr, 3);
    }
}
// Writes a repeated group followed by a literal group and flushes before the
// literal group is complete, growing the literal tail one value at a time.
TEST_F(BitRle, Flush) {
    // Sixteen trues followed by a single false.
    vector<bool> values(16, true);
    values.push_back(false);
    ValidateRle(values, 1, nullptr, -1);

    // Append up to three more trues, validating after each one.
    for (int extra = 0; extra < 3; ++extra) {
        values.push_back(true);
        ValidateRle(values, 1, nullptr, -1);
    }
}
// Round-trips pseudo-random 0/1 sequences made of alternating groups, using a
// different seed and a different bit width on every iteration.
TEST_F(BitRle, RandomBools) {
    const int n_iters = 20;
    for (int seed = 0; seed < n_iters; ++seed) {
        srand(seed);
        // One-based iteration number, used for logging and to pick the width.
        const int iters = seed + 1;
        if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters;

        vector<uint64_t> values;
        bool parity = false;
        for (int group = 0; group < 1000; ++group) {
            int group_size = rand() % 20 + 1; // NOLINT(*)
            // Sizes 17..20 collapse to 1.
            if (group_size > 16) {
                group_size = 1;
            }
            values.insert(values.end(), static_cast<size_t>(group_size),
                          static_cast<uint64_t>(parity));
            parity = !parity;
        }
        ValidateRle(values, (iters % kMaxWidth) + 1, nullptr, -1);
    }
}
// Round-trips pseudo-random sequences of 64-bit values, where each value is
// repeated for a pseudo-random group size. The generator is re-seeded with the
// iteration number on every pass.
TEST_F(BitRle, Random64Bit) {
    int iters = 0;
    const int n_iters = 20;
    while (iters < n_iters) {
        srand(iters++);
        if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters;
        vector<uint64_t> values;
        for (int group = 0; group < 1000; ++group) {
            int group_size = rand() % 20 + 1; // NOLINT(*)
            // Draw the two halves in separate statements: calling rand() twice
            // inside one expression leaves the call order up to the compiler,
            // so the same seed could produce different data per toolchain.
            const uint64_t high = static_cast<uint64_t>(rand()); // NOLINT(*)
            const uint64_t low = static_cast<uint64_t>(rand()); // NOLINT(*)
            const uint64_t cur_value = (high << 32) + low;
            // Sizes 17..20 collapse to 1.
            if (group_size > 16) {
                group_size = 1;
            }
            for (int rep = 0; rep < group_size; ++rep) {
                values.push_back(cur_value);
            }
        }
        ValidateRle(values, 64, nullptr, -1);
    }
}
// Round-trips runs whose lengths climb from 1 to 32 and then fall back to 1.
// A run of odd length holds 1's and a run of even length holds 0's.
TEST_F(BitRle, RepeatedPattern) {
    const int min_run = 1;
    const int max_run = 32;
    vector<bool> values;

    // Appends 'run' copies of the bit implied by the parity of 'run'.
    const auto append_run = [&values](int run) {
        values.insert(values.end(), static_cast<size_t>(run), (run % 2) != 0);
    };

    for (int run = min_run; run <= max_run; ++run) {
        append_run(run);
    }
    // And go back down again
    for (int run = max_run; run >= min_run; --run) {
        append_run(run);
    }
    ValidateRle(values, 1, nullptr, -1);
}
// Encodes runs with the bulk Put(value, count) overload and reads them back
// with GetNextRun(), expecting two adjacent runs of the same value to be
// reported as a single run.
TEST_F(TestRle, TestBulkPut) {
    // Upper bound passed to GetNextRun() so it never truncates a run.
    const size_t kNoLimit = std::numeric_limits<std::size_t>::max();

    faststring buffer(1);
    RleEncoder<bool> encoder(&buffer, 1);
    encoder.Put(true, 10);
    encoder.Put(false, 7);
    // 5 + 15 consecutive trues are expected back as one run of 20.
    encoder.Put(true, 5);
    encoder.Put(true, 15);
    encoder.Flush();

    RleDecoder<bool> decoder(buffer.data(), encoder.len(), 1);
    bool val = false;

    size_t run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_TRUE(val);
    ASSERT_EQ(10, run_length);

    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_FALSE(val);
    ASSERT_EQ(7, run_length);

    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_TRUE(val);
    ASSERT_EQ(20, run_length);

    // Nothing left to read.
    ASSERT_EQ(0, decoder.GetNextRun(&val, kNoLimit));
}
// Encodes alternating false/true runs of a fixed length and checks that
// GetNextRun() hands back each run with the right value and length.
TEST_F(TestRle, TestGetNextRun) {
    // Upper bound passed to GetNextRun() so it never truncates a run.
    const size_t kNoLimit = std::numeric_limits<std::size_t>::max();

    // Repeat the test with different number of items
    for (int num_items = 7; num_items < 200; num_items += 13) {
        // Test different block patterns
        // 1: 01010101 01010101
        // 2: 00110011 00110011
        // 3: 00011100 01110001
        // ...
        for (int block = 1; block <= 20; ++block) {
            faststring buffer(1);
            RleEncoder<bool> encoder(&buffer, 1);
            for (int item = 0; item < num_items; ++item) {
                encoder.Put((item & 1) != 0, block);
            }
            encoder.Flush();

            RleDecoder<bool> decoder(buffer.data(), encoder.len(), 1);
            // Number of values still expected from the decoder.
            size_t count = num_items * block;
            for (int item = 0; item < num_items; ++item) {
                const bool expected_val = (item & 1) != 0;
                bool val = false;
                DCHECK_GT(count, 0);
                const size_t raw_run = decoder.GetNextRun(&val, kNoLimit);
                // Never account for more values than were written.
                const size_t run_length = std::min(raw_run, count);
                ASSERT_EQ(expected_val, val);
                ASSERT_EQ(block, run_length);
                count -= run_length;
            }
            DCHECK_EQ(count, 0);
        }
    }
}
// Generates a random bit string made of 'num_runs' runs whose values
// alternate, starting with 0. Each run has a random length between 0 and 99
// inclusive, so a run may be empty.
//
//   num_runs   - number of runs to generate.
//   enc_buf    - buffer that receives the RLE-encoded bits.
//   string_rep - receives the same bits appended as '0' / '1' characters.
//
// Returns the number of values encoded (i.e. the sum of the run lengths).
static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) {
    RleEncoder<bool> enc(enc_buf, 1);
    // Accumulate in the return type to avoid an int -> size_t conversion.
    size_t num_bits = 0;
    for (int i = 0; i < num_runs; i++) {
        const int run_length = random() % 100;
        const bool value = static_cast<bool>(i & 1);
        enc.Put(value, run_length);
        string_rep->append(run_length, value ? '1' : '0');
        num_bits += run_length;
    }
    enc.Flush();
    return num_bits;
}
// Round-trips random bit strings through the encoder and reads them back with
// GetNextRun() under a random per-call limit, checking that the limit is
// honoured and that the decoded bits equal the generated ones.
TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) {
    // The random numbers used by this test come from random(), which is seeded
    // by srandom(); srand() only seeds rand(). The seed is attached to every
    // assertion below so that a failing run can be reproduced.
    const unsigned int seed = static_cast<unsigned int>(time(nullptr));
    srandom(seed);

    // Test the limiting function of GetNextRun.
    const int kMaxToReadAtOnce = (random() % 20) + 1;

    // Generate a bunch of random bit sequences, and "round-trip" them
    // through the encode/decode sequence.
    for (int rep = 0; rep < 100; rep++) {
        faststring buf;
        string string_rep;
        int num_bits = GenerateRandomBitString(10, &buf, &string_rep);
        RleDecoder<bool> decoder(buf.data(), buf.size(), 1);

        string roundtrip_str;
        int rem_to_read = num_bits;
        size_t run_len;
        bool val = false;
        while (rem_to_read > 0 &&
               (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) {
            ASSERT_LE(run_len, kMaxToReadAtOnce) << "seed=" << seed;
            roundtrip_str.append(run_len, val ? '1' : '0');
            rem_to_read -= run_len;
        }
        ASSERT_EQ(string_rep, roundtrip_str) << "seed=" << seed;
    }
}
// Interleaves Skip() and GetNextRun() over three differently shaped sections.
// In the diagrams below, square brackets mark the run that GetNextRun() is
// expected to return at each labelled position; the asserted Skip() results
// equal the number of 1 bits in the span that was skipped.
TEST_F(TestRle, TestSkip) {
    // Upper bound passed to GetNextRun() so it never truncates a run.
    const size_t kNoLimit = std::numeric_limits<std::size_t>::max();

    faststring buffer(1);
    RleEncoder<bool> encoder(&buffer, 1);

    // 0101010[1] 01010101 01
    //        "A"
    for (int bit = 0; bit < 18; ++bit) {
        encoder.Put((bit & 1) != 0);
    }

    // 0011[00] 11001100 11001100 11001100 11001100
    //      "B"
    for (int pair = 0; pair < 19; ++pair) {
        encoder.Put((pair & 1) != 0, 2);
    }

    // 000000000000 11[1111111111] 000000000000 111111111111
    //                   "C"
    // 000000000000 111111111111 0[00000000000] 111111111111
    //                                  "D"
    // 000000000000 111111111111 000000000000 111111111111
    for (int group = 0; group < 12; ++group) {
        encoder.Put((group & 1) != 0, 12);
    }
    encoder.Flush();

    RleDecoder<bool> decoder(buffer.data(), encoder.len(), 1);
    bool val = false;
    size_t run_length = 0;

    // position before "A"
    ASSERT_EQ(3, decoder.Skip(7));
    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_TRUE(val);
    ASSERT_EQ(1, run_length);

    // position before "B"
    ASSERT_EQ(7, decoder.Skip(14));
    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_FALSE(val);
    ASSERT_EQ(2, run_length);

    // position before "C"
    ASSERT_EQ(18, decoder.Skip(46));
    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_TRUE(val);
    ASSERT_EQ(10, run_length);

    // position before "D"
    ASSERT_EQ(24, decoder.Skip(49));
    run_length = decoder.GetNextRun(&val, kNoLimit);
    ASSERT_FALSE(val);
    ASSERT_EQ(11, run_length);

    encoder.Flush();
}
} // namespace doris
// Test entry point: lets gtest consume its command-line flags, then runs
// every registered test and returns the result as the process exit status.
int main(int argc, char** argv) {
    ::testing::InitGoogleTest(&argc, argv);
    const int status = RUN_ALL_TESTS();
    return status;
}