// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include #include #include #include #include #include "common/config.h" #include "env/env_posix.h" #include "gen_cpp/olap_file.pb.h" #include "olap/data_dir.h" #include "olap/row_cursor.h" #include "olap/rowset/beta_rowset_reader.h" #include "olap/rowset/beta_rowset_writer.h" #include "olap/rowset/rowset_factory.h" #include "olap/rowset/rowset_reader_context.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/storage_engine.h" #include "olap/tablet_schema.h" #include "olap/utils.h" #include "runtime/exec_env.h" #include "runtime/mem_pool.h" #include "runtime/memory/mem_tracker.h" #include "util/file_utils.h" #include "util/slice.h" namespace doris { using namespace ErrorCode; static const uint32_t MAX_PATH_LEN = 1024; StorageEngine* l_engine = nullptr; static const std::string lTestDir = "./data_test/data/segcompaction_test"; class SegCompactionTest : public testing::Test { public: SegCompactionTest() : _data_dir(std::make_unique(lTestDir)) { _data_dir->update_capacity(); } void SetUp() { config::enable_segcompaction = true; config::enable_storage_vectorization = true; config::tablet_map_shard_size = 1; config::txn_map_shard_size = 1; config::txn_shard_size = 1; char buffer[MAX_PATH_LEN]; EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); config::storage_root_path = std::string(buffer) + "/data_test"; EXPECT_TRUE(FileUtils::remove_all(config::storage_root_path).ok()); EXPECT_TRUE(FileUtils::create_dir(config::storage_root_path).ok()); std::vector paths; paths.emplace_back(config::storage_root_path, -1); doris::EngineOptions options; options.store_paths = paths; Status s = doris::StorageEngine::open(options, &l_engine); EXPECT_TRUE(s.ok()) << s.to_string(); ExecEnv* exec_env = doris::ExecEnv::GetInstance(); exec_env->set_storage_engine(l_engine); EXPECT_TRUE(FileUtils::create_dir(lTestDir).ok()); l_engine->start_bg_threads(); } void TearDown() { if (l_engine != nullptr) { l_engine->stop(); delete l_engine; l_engine = nullptr; } config::enable_segcompaction = false; } protected: OlapReaderStatistics _stats; bool check_dir(std::vector& vec) { std::vector result; for (const auto& entry : std::filesystem::directory_iterator(lTestDir)) { result.push_back(std::filesystem::path(entry.path()).filename()); } LOG(INFO) << "expected ls:" << std::endl; for (auto& i : vec) { LOG(INFO) << i; } LOG(INFO) << "acutal ls:" << std::endl; for (auto& i : result) { LOG(INFO) << i; } if (result.size() != vec.size()) { return false; } else { for (auto& i : vec) { if (std::find(result.begin(), result.end(), i) == result.end()) { return false; } } } return true; } // (k1 int, k2 varchar(20), k3 int) duplicated key (k1, k2) void create_tablet_schema(TabletSchemaSPtr tablet_schema) { TabletSchemaPB tablet_schema_pb; tablet_schema_pb.set_keys_type(DUP_KEYS); tablet_schema_pb.set_num_short_key_columns(2); tablet_schema_pb.set_num_rows_per_row_block(1024); tablet_schema_pb.set_compress_kind(COMPRESS_NONE); tablet_schema_pb.set_next_column_unique_id(4); ColumnPB* column_1 = tablet_schema_pb.add_column(); column_1->set_unique_id(1); column_1->set_name("k1"); column_1->set_type("INT"); column_1->set_is_key(true); column_1->set_length(4); column_1->set_index_length(4); column_1->set_is_nullable(true); column_1->set_is_bf_column(false); ColumnPB* column_2 = tablet_schema_pb.add_column(); column_2->set_unique_id(2); column_2->set_name("k2"); column_2->set_type( "INT"); // TODO change to varchar(20) when dict encoding for string is supported column_2->set_length(4); column_2->set_index_length(4); column_2->set_is_nullable(true); column_2->set_is_key(true); column_2->set_is_nullable(true); column_2->set_is_bf_column(false); ColumnPB* column_3 = tablet_schema_pb.add_column(); column_3->set_unique_id(3); column_3->set_name("v1"); column_3->set_type("INT"); column_3->set_length(4); column_3->set_is_key(false); column_3->set_is_nullable(false); column_3->set_is_bf_column(false); column_3->set_aggregation("SUM"); tablet_schema->init_from_pb(tablet_schema_pb); } // use different id to avoid conflict void create_rowset_writer_context(int64_t id, TabletSchemaSPtr tablet_schema, RowsetWriterContext* rowset_writer_context) { RowsetId rowset_id; rowset_id.init(id); // rowset_writer_context->data_dir = _data_dir.get(); rowset_writer_context->rowset_id = rowset_id; rowset_writer_context->tablet_id = 12345; rowset_writer_context->tablet_schema_hash = 1111; rowset_writer_context->partition_id = 10; rowset_writer_context->rowset_type = BETA_ROWSET; rowset_writer_context->rowset_dir = lTestDir; rowset_writer_context->rowset_state = VISIBLE; rowset_writer_context->tablet_schema = tablet_schema; rowset_writer_context->version.first = 10; rowset_writer_context->version.second = 10; } void create_and_init_rowset_reader(Rowset* rowset, RowsetReaderContext& context, RowsetReaderSharedPtr* result) { auto s = rowset->create_reader(result); EXPECT_EQ(Status::OK(), s); EXPECT_TRUE(*result != nullptr); s = (*result)->init(&context); EXPECT_EQ(Status::OK(), s); } private: std::unique_ptr _data_dir; }; TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { config::enable_segcompaction = true; config::enable_storage_vectorization = true; Status s; TabletSchemaSPtr tablet_schema = std::make_shared(); create_tablet_schema(tablet_schema); RowsetSharedPtr rowset; config::segcompaction_small_threshold = 6000; // set threshold above // rows_per_segment std::vector segment_num_rows; { // write `num_segments * rows_per_segment` rows to rowset RowsetWriterContext writer_context; create_rowset_writer_context(10048, tablet_schema, &writer_context); std::unique_ptr rowset_writer; s = RowsetFactory::create_rowset_writer(writer_context, false, &rowset_writer); EXPECT_EQ(Status::OK(), s); RowCursor input_row; input_row.init(tablet_schema); // for segment "i", row "rid" // k1 := rid*10 + i // k2 := k1 * 10 // k3 := 4096 * i + rid int num_segments = 4; uint32_t rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 2; rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 8; rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); sleep(1); } num_segments = 1; rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } rowset = rowset_writer->build(); std::vector ls; // ooooOOoOooooooooO ls.push_back("10048_0.dat"); // oooo ls.push_back("10048_1.dat"); // O ls.push_back("10048_2.dat"); // O ls.push_back("10048_3.dat"); // o ls.push_back("10048_4.dat"); // O ls.push_back("10048_5.dat"); // oooooooo ls.push_back("10048_6.dat"); // O EXPECT_TRUE(check_dir(ls)); } } TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { config::enable_segcompaction = true; config::enable_storage_vectorization = true; Status s; TabletSchemaSPtr tablet_schema = std::make_shared(); create_tablet_schema(tablet_schema); RowsetSharedPtr rowset; config::segcompaction_small_threshold = 6000; // set threshold above config::segcompaction_threshold_segment_num = 5; std::vector segment_num_rows; { // write `num_segments * rows_per_segment` rows to rowset RowsetWriterContext writer_context; create_rowset_writer_context(10049, tablet_schema, &writer_context); std::unique_ptr rowset_writer; s = RowsetFactory::create_rowset_writer(writer_context, false, &rowset_writer); EXPECT_EQ(Status::OK(), s); RowCursor input_row; input_row.init(tablet_schema); // for segment "i", row "rid" // k1 := rid*10 + i // k2 := k1 * 10 // k3 := 4096 * i + rid int num_segments = 1; uint32_t rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } num_segments = 1; rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { MemPool mem_pool; for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; uint32_t k3 = rid; input_row.set_field_content(0, reinterpret_cast(&k1), &mem_pool); input_row.set_field_content(1, reinterpret_cast(&k2), &mem_pool); input_row.set_field_content(2, reinterpret_cast(&k3), &mem_pool); s = rowset_writer->add_row(input_row); EXPECT_EQ(Status::OK(), s); } s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); } rowset = rowset_writer->build(); std::vector ls; ls.push_back("10049_0.dat"); // O ls.push_back("10049_1.dat"); // o ls.push_back("10049_2.dat"); // O ls.push_back("10049_3.dat"); // o ls.push_back("10049_4.dat"); // O EXPECT_TRUE(check_dir(ls)); } } } // namespace doris // @brief Test Stub