diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index fab834639e..fe3291e45a 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1162,6 +1162,9 @@ DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728"); // Default 300s, if its value <= 0, then log is disabled DEFINE_mInt64(enable_debug_log_timeout_secs, "0"); +// Tolerance for the number of partition id 0 in rowset, default 0 +DEFINE_Int32(ignore_invalid_partition_id_rowset_num, "0"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 9065baae53..25444346a1 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1237,6 +1237,9 @@ DECLARE_mInt64(enable_debug_log_timeout_secs); DECLARE_mBool(enable_column_type_check); +// Tolerance for the number of partition id 0 in rowset, default 0 +DECLARE_Int32(ignore_invalid_partition_id_rowset_num); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 662596bfba..61469ef055 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -409,6 +409,12 @@ Status DataDir::load() { RETURN_IF_ERROR(_meta->put(META_COLUMN_FAMILY_INDEX, key, result)); } } + + if (rowset_meta->partition_id() == 0) { + LOG(WARNING) << "rs tablet=" << rowset_meta->tablet_id() << " rowset_id=" << rowset_id + << " load from meta but partition id eq 0"; + } + dir_rowset_metas.push_back(rowset_meta); return true; }; @@ -497,6 +503,19 @@ Status DataDir::load() { RETURN_IF_ERROR( TabletMetaManager::traverse_pending_publish(_meta, load_pending_publish_info_func)); + int64_t rowset_partition_id_eq_0_num = 0; + for (auto rowset_meta : dir_rowset_metas) { + if (rowset_meta->partition_id() == 0) { + ++rowset_partition_id_eq_0_num; + } + } + if (rowset_partition_id_eq_0_num > config::ignore_invalid_partition_id_rowset_num) { + LOG(FATAL) << fmt::format( + "roswet partition id eq 0 bigger than config {}, be exit, plz check be.INFO", + config::ignore_invalid_partition_id_rowset_num); + exit(-1); + } + // traverse rowset // 1. add committed rowset to txn map // 2. add visible rowset to tablet @@ -513,6 +532,13 @@ Status DataDir::load() { continue; } + if (rowset_meta->partition_id() == 0) { + LOG(WARNING) << "skip tablet_id=" << tablet->tablet_id() + << " rowset: " << rowset_meta->rowset_id() + << " txn: " << rowset_meta->txn_id(); + continue; + } + RowsetSharedPtr rowset; Status create_status = tablet->create_rowset(rowset_meta, &rowset); if (!create_status) { @@ -528,7 +554,7 @@ Status DataDir::load() { rowset_meta->set_tablet_schema(tablet->tablet_schema()); RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(), rowset_meta->rowset_id(), - rowset_meta->get_rowset_pb())); + rowset_meta->get_rowset_pb(), false)); } Status commit_txn_status = _txn_manager->commit_txn( _meta, rowset_meta->partition_id(), rowset_meta->txn_id(), @@ -561,7 +587,7 @@ Status DataDir::load() { rowset_meta->set_tablet_schema(tablet->tablet_schema()); RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(), rowset_meta->rowset_id(), - rowset_meta->get_rowset_pb())); + rowset_meta->get_rowset_pb(), false)); } Status publish_status = tablet->add_rowset(rowset); if (!publish_status && !publish_status.is()) { diff --git a/be/src/olap/rowset/rowset_meta_manager.cpp b/be/src/olap/rowset/rowset_meta_manager.cpp index be49e66ee1..38911327d8 100644 --- a/be/src/olap/rowset/rowset_meta_manager.cpp +++ b/be/src/olap/rowset/rowset_meta_manager.cpp @@ -34,6 +34,7 @@ #include "olap/olap_define.h" #include "olap/olap_meta.h" #include "olap/utils.h" +#include "util/debug_points.h" namespace doris { @@ -95,15 +96,22 @@ Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const Rowse // return Status::InternalError("invaid partition id {} tablet {}", // rowset_meta_pb.partition_id(), rowset_meta_pb.tablet_id()); } + DBUG_EXECUTE_IF("RowsetMetaManager::save::zero_partition_id", { + long partition_id = rowset_meta_pb.partition_id(); + auto& rs_pb = const_cast&>(rowset_meta_pb); + rs_pb.set_partition_id(0); + LOG(WARNING) << "set debug point RowsetMetaManager::save::zero_partition_id old=" + << partition_id << " new=" << rowset_meta_pb.DebugString(); + }); if (enable_binlog) { return _save_with_binlog(meta, tablet_uid, rowset_id, rowset_meta_pb); } else { - return save(meta, tablet_uid, rowset_id, rowset_meta_pb); + return _save(meta, tablet_uid, rowset_id, rowset_meta_pb); } } -Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb) { +Status RowsetMetaManager::_save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb) { std::string key = fmt::format("{}{}_{}", ROWSET_PREFIX, tablet_uid.to_string(), rowset_id.to_string()); std::string value; @@ -523,7 +531,7 @@ Status RowsetMetaManager::load_json_rowset_meta(OlapMeta* meta, } RowsetId rowset_id = rowset_meta.rowset_id(); TabletUid tablet_uid = rowset_meta.tablet_uid(); - Status status = save(meta, tablet_uid, rowset_id, rowset_meta.get_rowset_pb()); + Status status = save(meta, tablet_uid, rowset_id, rowset_meta.get_rowset_pb(), false); return status; } diff --git a/be/src/olap/rowset/rowset_meta_manager.h b/be/src/olap/rowset/rowset_meta_manager.h index 31f3e6674f..9517ce3f51 100644 --- a/be/src/olap/rowset/rowset_meta_manager.h +++ b/be/src/olap/rowset/rowset_meta_manager.h @@ -54,8 +54,6 @@ public: // TODO(Drogon): refactor save && _save_with_binlog to one, adapt to ut temperately static Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, const RowsetMetaPB& rowset_meta_pb, bool enable_binlog); - static Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb); static std::vector get_binlog_filenames(OlapMeta* meta, TabletUid tablet_uid, std::string_view binlog_version, @@ -83,6 +81,8 @@ public: static Status load_json_rowset_meta(OlapMeta* meta, const std::string& rowset_meta_path); private: + static Status _save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb); static Status _save_with_binlog(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, const RowsetMetaPB& rowset_meta_pb); static Status _get_rowset_binlog_metas(OlapMeta* meta, const TabletUid tablet_uid, diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 7f3b61c6f1..b5eacce1a3 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -832,10 +832,10 @@ Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_ tablet_meta->set_tablet_state(TABLET_RUNNING); } - if (tablet_meta->partition_id() <= 0) { - LOG(WARNING) << "invalid partition id " << tablet_meta->partition_id() << ", tablet " - << tablet_meta->tablet_id(); + if (tablet_meta->partition_id() == 0) { + LOG(WARNING) << "tablet=" << tablet_id << " load from meta but partition id eq 0"; } + TabletSharedPtr tablet = std::make_shared(std::move(tablet_meta), data_dir); // NOTE: method load_tablet_from_meta could be called by two cases as below diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index fc4a22a617..3246e947d1 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -37,6 +37,7 @@ #include "olap/olap_define.h" #include "olap/tablet_meta_manager.h" #include "olap/utils.h" +#include "util/debug_points.h" #include "util/string_util.h" #include "util/time.h" #include "util/uid_util.h" @@ -468,6 +469,16 @@ Status TabletMeta::_save_meta(DataDir* data_dir) { Status TabletMeta::serialize(string* meta_binary) { TabletMetaPB tablet_meta_pb; to_meta_pb(&tablet_meta_pb); + if (tablet_meta_pb.partition_id() <= 0) { + LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet " + << tablet_meta_pb.tablet_id(); + } + DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", { + long partition_id = tablet_meta_pb.partition_id(); + tablet_meta_pb.set_partition_id(0); + LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old=" + << partition_id << " new=" << tablet_meta_pb.DebugString(); + }); bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary); if (!serialize_success) { LOG(FATAL) << "failed to serialize meta " << tablet_id(); diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 90c277d8ad..5cec68d75b 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -327,6 +327,13 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, do { // get tx std::shared_lock rdlock(_get_txn_map_lock(transaction_id)); + auto rs_pb = rowset_ptr->rowset_meta()->get_rowset_pb(); + // TODO(dx): remove log after fix partition id eq 0 bug + if (!rs_pb.has_partition_id() || rs_pb.partition_id() == 0) { + rowset_ptr->rowset_meta()->set_partition_id(partition_id); + LOG(WARNING) << "cant get partition id from rs pb, get from func arg partition_id=" + << partition_id; + } txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id); auto it = txn_tablet_map.find(key); if (it == txn_tablet_map.end()) { @@ -374,8 +381,9 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, // save meta need access disk, it maybe very slow, so that it is not in global txn lock // it is under a single txn lock if (!is_recovery) { - Status save_status = RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(), - rowset_ptr->rowset_meta()->get_rowset_pb()); + Status save_status = + RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(), + rowset_ptr->rowset_meta()->get_rowset_pb(), false); DBUG_EXECUTE_IF("TxnManager.RowsetMetaManager.save_wait", { if (auto wait = dp->param("duration", 0); wait > 0) { LOG_WARNING("TxnManager.RowsetMetaManager.save_wait").tag("wait ms", wait); diff --git a/be/test/olap/path_gc_test.cpp b/be/test/olap/path_gc_test.cpp index c1b28bf31d..985ac63257 100644 --- a/be/test/olap/path_gc_test.cpp +++ b/be/test/olap/path_gc_test.cpp @@ -176,7 +176,7 @@ TEST(PathGcTest, GcTabletAndRowset) { st = create_rowset_files(*rs, false); ASSERT_TRUE(st.ok()) << st; st = RowsetMetaManager::save(data_dir.get_meta(), rs->rowset_meta()->tablet_uid(), - rs->rowset_id(), rs->rowset_meta()->get_rowset_pb()); + rs->rowset_id(), rs->rowset_meta()->get_rowset_pb(), false); ASSERT_TRUE(st.ok()) << st; } // Prepare garbage rowset files diff --git a/be/test/olap/rowset/rowset_meta_manager_test.cpp b/be/test/olap/rowset/rowset_meta_manager_test.cpp index e27e848ed4..c4d49d8c28 100644 --- a/be/test/olap/rowset/rowset_meta_manager_test.cpp +++ b/be/test/olap/rowset/rowset_meta_manager_test.cpp @@ -106,7 +106,7 @@ TEST_F(RowsetMetaManagerTest, TestSaveAndGetAndRemove) { EXPECT_EQ(rowset_meta.rowset_id(), rowset_id); RowsetMetaPB rowset_meta_pb; rowset_meta.to_rowset_pb(&rowset_meta_pb); - Status status = RowsetMetaManager::save(_meta, _tablet_uid, rowset_id, rowset_meta_pb); + Status status = RowsetMetaManager::save(_meta, _tablet_uid, rowset_id, rowset_meta_pb, false); EXPECT_TRUE(status == Status::OK()); EXPECT_TRUE(RowsetMetaManager::check_rowset_meta(_meta, _tablet_uid, rowset_id)); std::string json_rowset_meta_read;