Currently, there are some useless includes in the codebase. We can use a tool named include-what-you-use to optimize these includes. By using a strict include-what-you-use policy, we can get lots of benefits from it.
460 lines
17 KiB
C++
460 lines
17 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
//
|
|
// This file is copied from
|
|
// https://github.com/apache/kudu/blob/master/src/kudu/tablet/rowset_tree-test.cc
|
|
// and modified by Doris
|
|
|
|
#include "olap/rowset/rowset_tree.h"
|
|
|
|
#include <gen_cpp/olap_file.pb.h>
|
|
#include <glog/logging.h>
|
|
#include <gtest/gtest-message.h>
|
|
#include <gtest/gtest-param-test.h>
|
|
#include <gtest/gtest-test-part.h>
|
|
|
|
#include <algorithm>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <unordered_set>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "gtest/gtest_pred_impl.h"
|
|
#include "gutil/map-util.h"
|
|
#include "gutil/stringprintf.h"
|
|
#include "gutil/strings/substitute.h"
|
|
#include "olap/rowset/rowset.h"
|
|
#include "olap/rowset/rowset_meta.h"
|
|
#include "olap/rowset/unique_rowset_id_generator.h"
|
|
#include "olap/tablet_schema.h"
|
|
#include "testutil/mock_rowset.h"
|
|
#include "testutil/test_util.h"
|
|
#include "util/slice.h"
|
|
#include "util/stopwatch.hpp"
|
|
|
|
using std::make_shared;
|
|
using std::shared_ptr;
|
|
using std::string;
|
|
using std::unordered_set;
|
|
using std::vector;
|
|
using strings::Substitute;
|
|
|
|
namespace doris {
|
|
|
|
class TestRowsetTree : public testing::Test {
|
|
public:
|
|
TestRowsetTree() : rowset_id_generator_({0, 0}) {}
|
|
|
|
void SetUp() {
|
|
schema_ = std::make_shared<TabletSchema>();
|
|
TabletSchemaPB schema_pb;
|
|
schema_pb.set_keys_type(UNIQUE_KEYS);
|
|
schema_->init_from_pb(schema_pb);
|
|
}
|
|
|
|
// Generates random rowsets with keys between 0 and 10000
|
|
RowsetVector GenerateRandomRowsets(int num_sets) {
|
|
RowsetVector vec;
|
|
for (int i = 0; i < num_sets; i++) {
|
|
int min = rand() % 9000;
|
|
int max = min + 1000;
|
|
vec.push_back(create_rowset(StringPrintf("%04d", min), StringPrintf("%04d", max)));
|
|
}
|
|
return vec;
|
|
}
|
|
|
|
RowsetSharedPtr create_rowset(const string& min_key, const string& max_key,
|
|
bool is_mem_rowset = false) {
|
|
RowsetMetaPB rs_meta_pb;
|
|
rs_meta_pb.set_rowset_id_v2(rowset_id_generator_.next_id().to_string());
|
|
rs_meta_pb.set_num_segments(1);
|
|
KeyBoundsPB key_bounds;
|
|
key_bounds.set_min_key(min_key);
|
|
key_bounds.set_max_key(max_key);
|
|
KeyBoundsPB* new_key_bounds = rs_meta_pb.add_segments_key_bounds();
|
|
*new_key_bounds = key_bounds;
|
|
RowsetMetaSharedPtr meta_ptr = make_shared<RowsetMeta>();
|
|
meta_ptr->init_from_pb(rs_meta_pb);
|
|
RowsetSharedPtr res_ptr;
|
|
MockRowset::create_rowset(schema_, rowset_path_, meta_ptr, &res_ptr, is_mem_rowset);
|
|
return res_ptr;
|
|
}
|
|
|
|
private:
|
|
TabletSchemaSPtr schema_;
|
|
std::string rowset_path_;
|
|
UniqueRowsetIdGenerator rowset_id_generator_;
|
|
};
|
|
|
|
TEST_F(TestRowsetTree, TestTree) {
|
|
RowsetIdUnorderedSet rowset_ids;
|
|
RowsetVector vec;
|
|
auto rowset1 = create_rowset("0", "5");
|
|
vec.push_back(rowset1);
|
|
rowset_ids.insert(rowset1->rowset_id());
|
|
|
|
auto rowset2 = create_rowset("3", "5");
|
|
vec.push_back(rowset2);
|
|
rowset_ids.insert(rowset2->rowset_id());
|
|
|
|
auto rowset3 = create_rowset("5", "9");
|
|
vec.push_back(rowset3);
|
|
rowset_ids.insert(rowset3->rowset_id());
|
|
|
|
auto rowset4 = create_rowset("0", "0", true);
|
|
vec.push_back(rowset4);
|
|
rowset_ids.insert(rowset4->rowset_id());
|
|
|
|
RowsetTree tree;
|
|
ASSERT_FALSE(tree.Init(vec).ok());
|
|
|
|
vec.erase(vec.begin() + 3);
|
|
ASSERT_TRUE(tree.Init(vec).ok());
|
|
|
|
// "2" overlaps 0-5
|
|
vector<std::pair<RowsetSharedPtr, int32_t>> out;
|
|
tree.FindRowsetsWithKeyInRange("2", &rowset_ids, &out);
|
|
ASSERT_EQ(1, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
|
|
// "4" overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsWithKeyInRange("4", &rowset_ids, &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// interval [3,4) overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("3"), Slice("4"), &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// interval [0,2) overlaps 0-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("0"), Slice("2"), &out);
|
|
ASSERT_EQ(1, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
|
|
// interval [5,7) overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("5"), Slice("7"), &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
|
|
// "3" overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsWithKeyInRange("3", &rowset_ids, &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// "5" overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsWithKeyInRange("5", &rowset_ids, &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
|
|
// interval [0,5) overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("0"), Slice("5"), &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// interval [3,5) overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("3"), Slice("5"), &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// interval [-OO,3) overlaps 0-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(std::nullopt, Slice("3"), &out);
|
|
ASSERT_EQ(1, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
|
|
// interval [-OO,5) overlaps 0-5, 3-5
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(std::nullopt, Slice("5"), &out);
|
|
ASSERT_EQ(2, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
|
|
// interval [-OO,99) overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(std::nullopt, Slice("99"), &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
|
|
// interval [6,+OO) overlaps 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("6"), std::nullopt, &out);
|
|
ASSERT_EQ(1, out.size());
|
|
ASSERT_EQ(vec[2].get(), out[0].first.get());
|
|
|
|
// interval [5,+OO) overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("5"), std::nullopt, &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
|
|
// interval [4,+OO) overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(Slice("4"), std::nullopt, &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
|
|
// interval [-OO,+OO) overlaps 0-5, 3-5, 5-9
|
|
out.clear();
|
|
tree.FindRowsetsIntersectingInterval(std::nullopt, std::nullopt, &out);
|
|
ASSERT_EQ(3, out.size());
|
|
ASSERT_EQ(vec[0].get(), out[0].first.get());
|
|
ASSERT_EQ(vec[1].get(), out[1].first.get());
|
|
ASSERT_EQ(vec[2].get(), out[2].first.get());
|
|
}
|
|
|
|
TEST_F(TestRowsetTree, TestTreeRandomized) {
|
|
enum BoundOperator {
|
|
BOUND_LESS_THAN,
|
|
BOUND_LESS_EQUAL,
|
|
BOUND_GREATER_THAN,
|
|
BOUND_GREATER_EQUAL,
|
|
BOUND_EQUAL
|
|
};
|
|
const auto& GetStringPair = [](const BoundOperator op, int start, int range_length) {
|
|
while (true) {
|
|
string s1 = Substitute("$0", rand_rng_int(start, start + range_length));
|
|
string s2 = Substitute("$0", rand_rng_int(start, start + range_length));
|
|
int r = strcmp(s1.c_str(), s2.c_str());
|
|
switch (op) {
|
|
case BOUND_LESS_THAN:
|
|
if (r == 0) continue;
|
|
[[fallthrough]];
|
|
case BOUND_LESS_EQUAL:
|
|
return std::pair<string, string>(std::min(s1, s2), std::max(s1, s2));
|
|
case BOUND_GREATER_THAN:
|
|
if (r == 0) continue;
|
|
[[fallthrough]];
|
|
case BOUND_GREATER_EQUAL:
|
|
return std::pair<string, string>(std::max(s1, s2), std::min(s1, s2));
|
|
case BOUND_EQUAL:
|
|
return std::pair<string, string>(s1, s1);
|
|
}
|
|
}
|
|
};
|
|
|
|
RowsetVector vec;
|
|
for (int i = 0; i < 100; i++) {
|
|
std::pair<string, string> bound = GetStringPair(BOUND_LESS_EQUAL, 1000, 900);
|
|
ASSERT_LE(bound.first, bound.second);
|
|
vec.push_back(shared_ptr<Rowset>(create_rowset(bound.first, bound.second)));
|
|
}
|
|
RowsetTree tree;
|
|
ASSERT_TRUE(tree.Init(vec).ok());
|
|
|
|
// When lower < upper.
|
|
vector<std::pair<RowsetSharedPtr, int32_t>> out;
|
|
for (int i = 0; i < 100; i++) {
|
|
out.clear();
|
|
std::pair<string, string> bound = GetStringPair(BOUND_LESS_THAN, 1000, 900);
|
|
ASSERT_LT(bound.first, bound.second);
|
|
tree.FindRowsetsIntersectingInterval(Slice(bound.first), Slice(bound.second), &out);
|
|
for (const auto& e : out) {
|
|
std::vector<KeyBoundsPB> segments_key_bounds;
|
|
e.first->get_segments_key_bounds(&segments_key_bounds);
|
|
ASSERT_EQ(1, segments_key_bounds.size());
|
|
string min = segments_key_bounds[0].min_key();
|
|
string max = segments_key_bounds[0].max_key();
|
|
if (min < bound.first) {
|
|
ASSERT_GE(max, bound.first);
|
|
} else {
|
|
ASSERT_LT(min, bound.second);
|
|
}
|
|
if (max >= bound.second) {
|
|
ASSERT_LT(min, bound.second);
|
|
} else {
|
|
ASSERT_GE(max, bound.first);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove 50 rowsets, add 10 new rowsets, with non overlapping key range.
|
|
RowsetVector vec_to_del(vec.begin(), vec.begin() + 50);
|
|
RowsetVector vec_to_add;
|
|
for (int i = 0; i < 10; i++) {
|
|
std::pair<string, string> bound = GetStringPair(BOUND_LESS_EQUAL, 2000, 900);
|
|
ASSERT_LE(bound.first, bound.second);
|
|
vec_to_add.push_back(shared_ptr<Rowset>(create_rowset(bound.first, bound.second)));
|
|
}
|
|
|
|
RowsetTree new_tree;
|
|
ModifyRowSetTree(tree, vec_to_del, vec_to_add, &new_tree);
|
|
|
|
// only 50 rowsets left in old key range "1000"-"1900"
|
|
out.clear();
|
|
new_tree.FindRowsetsIntersectingInterval(Slice("1000"), Slice("1999"), &out);
|
|
ASSERT_EQ(50, out.size());
|
|
// should get 10 new added rowsets with key range "2000"-"2900"
|
|
out.clear();
|
|
new_tree.FindRowsetsIntersectingInterval(Slice("2000"), Slice("2999"), &out);
|
|
ASSERT_EQ(10, out.size());
|
|
out.clear();
|
|
new_tree.FindRowsetsIntersectingInterval(Slice("1000"), Slice("2999"), &out);
|
|
ASSERT_EQ(60, out.size());
|
|
}
|
|
|
|
class TestRowsetTreePerformance : public TestRowsetTree,
|
|
public testing::WithParamInterface<std::tuple<int, int>> {};
|
|
INSTANTIATE_TEST_SUITE_P(Parameters, TestRowsetTreePerformance,
|
|
testing::Combine(
|
|
// Number of rowsets.
|
|
// Up to 500 rowsets (500*32MB = 16GB tablet)
|
|
testing::Values(10, 100, 250, 500),
|
|
// Number of query points in a batch.
|
|
testing::Values(10, 100, 500, 1000, 5000)));
|
|
|
|
TEST_P(TestRowsetTreePerformance, TestPerformance) {
|
|
const int kNumRowsets = std::get<0>(GetParam());
|
|
const int kNumQueries = std::get<1>(GetParam());
|
|
const int kNumIterations = AllowSlowTests() ? 1000 : 10;
|
|
|
|
MonotonicStopWatch one_at_time_timer;
|
|
MonotonicStopWatch batch_timer;
|
|
RowsetIdUnorderedSet rowset_ids;
|
|
for (int i = 0; i < kNumIterations; i++) {
|
|
rowset_ids.clear();
|
|
// Create a bunch of rowsets, each of which spans about 10% of the "row space".
|
|
// The row space here is 4-digit 0-padded numbers.
|
|
RowsetVector vec = GenerateRandomRowsets(kNumRowsets);
|
|
for (auto rowset : vec) {
|
|
rowset_ids.insert(rowset->rowset_id());
|
|
}
|
|
|
|
RowsetTree tree;
|
|
ASSERT_TRUE(tree.Init(vec).ok());
|
|
|
|
vector<string> queries;
|
|
for (int j = 0; j < kNumQueries; j++) {
|
|
int query = rand_rng_int(0, 10000);
|
|
queries.emplace_back(StringPrintf("%04d", query));
|
|
}
|
|
|
|
int individual_matches = 0;
|
|
one_at_time_timer.start();
|
|
{
|
|
vector<std::pair<RowsetSharedPtr, int32_t>> out;
|
|
for (const auto& q : queries) {
|
|
out.clear();
|
|
tree.FindRowsetsWithKeyInRange(Slice(q), &rowset_ids, &out);
|
|
individual_matches += out.size();
|
|
}
|
|
}
|
|
one_at_time_timer.stop();
|
|
|
|
vector<Slice> query_slices;
|
|
for (const auto& q : queries) {
|
|
query_slices.emplace_back(q);
|
|
}
|
|
|
|
batch_timer.start();
|
|
std::sort(query_slices.begin(), query_slices.end(), Slice::Comparator());
|
|
int bulk_matches = 0;
|
|
{
|
|
tree.ForEachRowsetContainingKeys(
|
|
query_slices, [&](RowsetSharedPtr rs, int slice_idx) { bulk_matches++; });
|
|
}
|
|
batch_timer.stop();
|
|
|
|
ASSERT_EQ(bulk_matches, individual_matches);
|
|
}
|
|
|
|
double batch_total = batch_timer.elapsed_time();
|
|
double oat_total = one_at_time_timer.elapsed_time();
|
|
const string& case_desc = StringPrintf("Q=% 5d R=% 5d", kNumQueries, kNumRowsets);
|
|
LOG(INFO) << StringPrintf("%s %10s %d ms", case_desc.c_str(), "1-by-1",
|
|
static_cast<int>(oat_total / 1e6));
|
|
LOG(INFO) << StringPrintf("%s %10s %d ms (%.2fx)", case_desc.c_str(), "batched",
|
|
static_cast<int>(batch_total / 1e6),
|
|
batch_total ? (oat_total / batch_total) : 0);
|
|
}
|
|
|
|
TEST_F(TestRowsetTree, TestEndpointsConsistency) {
|
|
const int kNumRowsets = 1000;
|
|
RowsetVector vec = GenerateRandomRowsets(kNumRowsets);
|
|
// Add pathological one-key rows
|
|
for (int i = 0; i < 10; ++i) {
|
|
vec.push_back(create_rowset(StringPrintf("%04d", 11000), StringPrintf("%04d", 11000)));
|
|
}
|
|
vec.push_back(create_rowset(StringPrintf("%04d", 12000), StringPrintf("%04d", 12000)));
|
|
// Make tree
|
|
RowsetTree tree;
|
|
ASSERT_TRUE(tree.Init(vec).ok());
|
|
// Keep track of "currently open" intervals defined by the endpoints
|
|
unordered_set<RowsetSharedPtr> open;
|
|
// Keep track of all rowsets that have been visited
|
|
unordered_set<RowsetSharedPtr> visited;
|
|
|
|
Slice prev;
|
|
for (const RowsetTree::RSEndpoint& rse : tree.key_endpoints()) {
|
|
RowsetSharedPtr rs = rse.rowset_;
|
|
enum RowsetTree::EndpointType ept = rse.endpoint_;
|
|
const Slice& slice = rse.slice_;
|
|
|
|
ASSERT_TRUE(rs != nullptr) << "RowsetTree has an endpoint with no rowset";
|
|
ASSERT_TRUE(!slice.empty()) << "RowsetTree has an endpoint with no key";
|
|
|
|
if (!prev.empty()) {
|
|
ASSERT_LE(prev.compare(slice), 0);
|
|
}
|
|
|
|
std::vector<KeyBoundsPB> segments_key_bounds;
|
|
ASSERT_TRUE(rs->get_segments_key_bounds(&segments_key_bounds).ok());
|
|
ASSERT_EQ(1, segments_key_bounds.size());
|
|
string min = segments_key_bounds[0].min_key();
|
|
string max = segments_key_bounds[0].max_key();
|
|
if (ept == RowsetTree::START) {
|
|
ASSERT_EQ(min, slice.to_string());
|
|
ASSERT_TRUE(InsertIfNotPresent(&open, rs));
|
|
ASSERT_TRUE(InsertIfNotPresent(&visited, rs));
|
|
} else if (ept == RowsetTree::STOP) {
|
|
ASSERT_EQ(max, slice.to_string());
|
|
ASSERT_TRUE(open.erase(rs) == 1);
|
|
} else {
|
|
FAIL() << "No such endpoint type exists";
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace doris
|