[Fix](partition) Skip rowset partition id eq 0 smaller than config wh… (#29510)

This commit is contained in:
deardeng
2024-01-05 19:39:51 +08:00
committed by GitHub
parent 7402fee1fc
commit f40cce1406
10 changed files with 74 additions and 15 deletions

View File

@ -1162,6 +1162,9 @@ DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728");
// Default 300s, if its value <= 0, then log is disabled
DEFINE_mInt64(enable_debug_log_timeout_secs, "0");
// Maximum number of rowsets with partition id 0 (invalid) that is tolerated, default 0.
// DataDir::load() counts such rowsets among the rowset metas it loaded: if the count
// exceeds this value it logs a FATAL error and exits; otherwise those rowsets are
// skipped (with a warning) instead of being added to the txn map or the tablet.
DEFINE_Int32(ignore_invalid_partition_id_rowset_num, "0");
// clang-format off
#ifdef BE_TEST
// test s3

View File

@ -1237,6 +1237,9 @@ DECLARE_mInt64(enable_debug_log_timeout_secs);
DECLARE_mBool(enable_column_type_check);
// Maximum number of rowsets with partition id 0 (invalid) that is tolerated, default 0.
// DataDir::load() exits with a FATAL log when more such rowsets than this are found,
// and skips them otherwise.
DECLARE_Int32(ignore_invalid_partition_id_rowset_num);
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);

View File

@ -409,6 +409,12 @@ Status DataDir::load() {
RETURN_IF_ERROR(_meta->put(META_COLUMN_FAMILY_INDEX, key, result));
}
}
if (rowset_meta->partition_id() == 0) {
LOG(WARNING) << "rs tablet=" << rowset_meta->tablet_id() << " rowset_id=" << rowset_id
<< " load from meta but partition id eq 0";
}
dir_rowset_metas.push_back(rowset_meta);
return true;
};
@ -497,6 +503,19 @@ Status DataDir::load() {
RETURN_IF_ERROR(
TabletMetaManager::traverse_pending_publish(_meta, load_pending_publish_info_func));
// Count the loaded rowset metas whose partition id is 0, which is treated as invalid.
int64_t rowset_partition_id_eq_0_num = 0;
// Bind by const reference: the elements are only inspected, so copying each one
// (presumably a shared pointer -- confirm against dir_rowset_metas' declaration)
// per iteration is unnecessary.
for (const auto& rowset_meta : dir_rowset_metas) {
    if (rowset_meta->partition_id() == 0) {
        ++rowset_partition_id_eq_0_num;
    }
}
// Refuse to continue when more invalid rowsets were found than the operator has
// explicitly agreed to tolerate via config. The message carries both the observed
// count and the threshold so the failure can be diagnosed from the log line alone.
if (rowset_partition_id_eq_0_num > config::ignore_invalid_partition_id_rowset_num) {
    LOG(FATAL) << fmt::format(
            "rowset partition id eq 0 num {} bigger than config {}, be exit, plz check be.INFO",
            rowset_partition_id_eq_0_num, config::ignore_invalid_partition_id_rowset_num);
    // NOTE(review): LOG(FATAL) normally terminates the process by itself; the explicit
    // exit is kept as a fallback in case the logger is configured not to abort.
    exit(-1);
}
// traverse rowset
// 1. add committed rowset to txn map
// 2. add visible rowset to tablet
@ -513,6 +532,13 @@ Status DataDir::load() {
continue;
}
if (rowset_meta->partition_id() == 0) {
LOG(WARNING) << "skip tablet_id=" << tablet->tablet_id()
<< " rowset: " << rowset_meta->rowset_id()
<< " txn: " << rowset_meta->txn_id();
continue;
}
RowsetSharedPtr rowset;
Status create_status = tablet->create_rowset(rowset_meta, &rowset);
if (!create_status) {
@ -528,7 +554,7 @@ Status DataDir::load() {
rowset_meta->set_tablet_schema(tablet->tablet_schema());
RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(),
rowset_meta->rowset_id(),
rowset_meta->get_rowset_pb()));
rowset_meta->get_rowset_pb(), false));
}
Status commit_txn_status = _txn_manager->commit_txn(
_meta, rowset_meta->partition_id(), rowset_meta->txn_id(),
@ -561,7 +587,7 @@ Status DataDir::load() {
rowset_meta->set_tablet_schema(tablet->tablet_schema());
RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(),
rowset_meta->rowset_id(),
rowset_meta->get_rowset_pb()));
rowset_meta->get_rowset_pb(), false));
}
Status publish_status = tablet->add_rowset(rowset);
if (!publish_status && !publish_status.is<PUSH_VERSION_ALREADY_EXIST>()) {

View File

@ -34,6 +34,7 @@
#include "olap/olap_define.h"
#include "olap/olap_meta.h"
#include "olap/utils.h"
#include "util/debug_points.h"
namespace doris {
@ -95,15 +96,22 @@ Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const Rowse
// return Status::InternalError("invaid partition id {} tablet {}",
// rowset_meta_pb.partition_id(), rowset_meta_pb.tablet_id());
}
DBUG_EXECUTE_IF("RowsetMetaManager::save::zero_partition_id", {
long partition_id = rowset_meta_pb.partition_id();
auto& rs_pb = const_cast<std::decay_t<decltype(rowset_meta_pb)>&>(rowset_meta_pb);
rs_pb.set_partition_id(0);
LOG(WARNING) << "set debug point RowsetMetaManager::save::zero_partition_id old="
<< partition_id << " new=" << rowset_meta_pb.DebugString();
});
if (enable_binlog) {
return _save_with_binlog(meta, tablet_uid, rowset_id, rowset_meta_pb);
} else {
return save(meta, tablet_uid, rowset_id, rowset_meta_pb);
return _save(meta, tablet_uid, rowset_id, rowset_meta_pb);
}
}
Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb) {
Status RowsetMetaManager::_save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb) {
std::string key =
fmt::format("{}{}_{}", ROWSET_PREFIX, tablet_uid.to_string(), rowset_id.to_string());
std::string value;
@ -523,7 +531,7 @@ Status RowsetMetaManager::load_json_rowset_meta(OlapMeta* meta,
}
RowsetId rowset_id = rowset_meta.rowset_id();
TabletUid tablet_uid = rowset_meta.tablet_uid();
Status status = save(meta, tablet_uid, rowset_id, rowset_meta.get_rowset_pb());
Status status = save(meta, tablet_uid, rowset_id, rowset_meta.get_rowset_pb(), false);
return status;
}

View File

@ -54,8 +54,6 @@ public:
// TODO(Drogon): refactor save && _save_with_binlog to one, adapt to ut temporarily
static Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb, bool enable_binlog);
static Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb);
static std::vector<std::string> get_binlog_filenames(OlapMeta* meta, TabletUid tablet_uid,
std::string_view binlog_version,
@ -83,6 +81,8 @@ public:
static Status load_json_rowset_meta(OlapMeta* meta, const std::string& rowset_meta_path);
private:
static Status _save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb);
static Status _save_with_binlog(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id,
const RowsetMetaPB& rowset_meta_pb);
static Status _get_rowset_binlog_metas(OlapMeta* meta, const TabletUid tablet_uid,

View File

@ -832,10 +832,10 @@ Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_
tablet_meta->set_tablet_state(TABLET_RUNNING);
}
if (tablet_meta->partition_id() <= 0) {
LOG(WARNING) << "invalid partition id " << tablet_meta->partition_id() << ", tablet "
<< tablet_meta->tablet_id();
if (tablet_meta->partition_id() == 0) {
LOG(WARNING) << "tablet=" << tablet_id << " load from meta but partition id eq 0";
}
TabletSharedPtr tablet = std::make_shared<Tablet>(std::move(tablet_meta), data_dir);
// NOTE: method load_tablet_from_meta could be called by two cases as below

View File

@ -37,6 +37,7 @@
#include "olap/olap_define.h"
#include "olap/tablet_meta_manager.h"
#include "olap/utils.h"
#include "util/debug_points.h"
#include "util/string_util.h"
#include "util/time.h"
#include "util/uid_util.h"
@ -468,6 +469,16 @@ Status TabletMeta::_save_meta(DataDir* data_dir) {
Status TabletMeta::serialize(string* meta_binary) {
TabletMetaPB tablet_meta_pb;
to_meta_pb(&tablet_meta_pb);
if (tablet_meta_pb.partition_id() <= 0) {
LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet "
<< tablet_meta_pb.tablet_id();
}
DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
long partition_id = tablet_meta_pb.partition_id();
tablet_meta_pb.set_partition_id(0);
LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
<< partition_id << " new=" << tablet_meta_pb.DebugString();
});
bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary);
if (!serialize_success) {
LOG(FATAL) << "failed to serialize meta " << tablet_id();

View File

@ -327,6 +327,13 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id,
do {
// get tx
std::shared_lock rdlock(_get_txn_map_lock(transaction_id));
auto rs_pb = rowset_ptr->rowset_meta()->get_rowset_pb();
// TODO(dx): remove log after fix partition id eq 0 bug
if (!rs_pb.has_partition_id() || rs_pb.partition_id() == 0) {
rowset_ptr->rowset_meta()->set_partition_id(partition_id);
LOG(WARNING) << "cant get partition id from rs pb, get from func arg partition_id="
<< partition_id;
}
txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id);
auto it = txn_tablet_map.find(key);
if (it == txn_tablet_map.end()) {
@ -374,8 +381,9 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id,
// save meta need access disk, it maybe very slow, so that it is not in global txn lock
// it is under a single txn lock
if (!is_recovery) {
Status save_status = RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(),
rowset_ptr->rowset_meta()->get_rowset_pb());
Status save_status =
RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(),
rowset_ptr->rowset_meta()->get_rowset_pb(), false);
DBUG_EXECUTE_IF("TxnManager.RowsetMetaManager.save_wait", {
if (auto wait = dp->param<int>("duration", 0); wait > 0) {
LOG_WARNING("TxnManager.RowsetMetaManager.save_wait").tag("wait ms", wait);

View File

@ -176,7 +176,7 @@ TEST(PathGcTest, GcTabletAndRowset) {
st = create_rowset_files(*rs, false);
ASSERT_TRUE(st.ok()) << st;
st = RowsetMetaManager::save(data_dir.get_meta(), rs->rowset_meta()->tablet_uid(),
rs->rowset_id(), rs->rowset_meta()->get_rowset_pb());
rs->rowset_id(), rs->rowset_meta()->get_rowset_pb(), false);
ASSERT_TRUE(st.ok()) << st;
}
// Prepare garbage rowset files

View File

@ -106,7 +106,7 @@ TEST_F(RowsetMetaManagerTest, TestSaveAndGetAndRemove) {
EXPECT_EQ(rowset_meta.rowset_id(), rowset_id);
RowsetMetaPB rowset_meta_pb;
rowset_meta.to_rowset_pb(&rowset_meta_pb);
Status status = RowsetMetaManager::save(_meta, _tablet_uid, rowset_id, rowset_meta_pb);
Status status = RowsetMetaManager::save(_meta, _tablet_uid, rowset_id, rowset_meta_pb, false);
EXPECT_TRUE(status == Status::OK());
EXPECT_TRUE(RowsetMetaManager::check_rowset_meta(_meta, _tablet_uid, rowset_id));
std::string json_rowset_meta_read;