Add protection for too many sstables and wrs

This commit is contained in:
Handora 2023-04-04 08:15:01 +00:00 committed by ob-robot
parent caf2783ed1
commit d2f227903e
3 changed files with 75 additions and 7 deletions

View File

@ -32,6 +32,7 @@ ObMultiVersionGarbageCollector::ObMultiVersionGarbageCollector()
last_study_timestamp_(0),
last_refresh_timestamp_(0),
last_reclaim_timestamp_(0),
last_sstable_overflow_timestamp_(0),
has_error_when_study_(false),
refresh_error_too_long_(false),
has_error_when_reclaim_(false),
@ -56,6 +57,7 @@ int ObMultiVersionGarbageCollector::init()
last_study_timestamp_ = 0;
last_refresh_timestamp_ = 0;
last_reclaim_timestamp_ = 0;
last_sstable_overflow_timestamp_ = 0;
has_error_when_study_ = false;
refresh_error_too_long_ = false;
has_error_when_reclaim_ = false;
@ -72,6 +74,7 @@ void ObMultiVersionGarbageCollector::cure()
last_study_timestamp_ = 0;
last_refresh_timestamp_ = 0;
last_reclaim_timestamp_ = 0;
last_sstable_overflow_timestamp_ = 0;
has_error_when_study_ = false;
refresh_error_too_long_ = false;
has_error_when_reclaim_ = false;
@ -134,6 +137,7 @@ int ObMultiVersionGarbageCollector::stop()
last_study_timestamp_ = 0;
last_refresh_timestamp_ = 0;
last_reclaim_timestamp_ = 0;
last_sstable_overflow_timestamp_ = 0;
has_error_when_study_ = false;
refresh_error_too_long_ = false;
has_error_when_reclaim_ = false;
@ -536,7 +540,8 @@ int ObMultiVersionGarbageCollector::refresh_()
// Step3: cache the reserved snapshot of active txn for future use.
// NB: be careful of the lower value and maximum value which is not reasonable
decide_reserved_snapshot_version_(collector.get_reserved_snapshot_version());
decide_reserved_snapshot_version_(collector.get_reserved_snapshot_version(),
collector.get_reserved_snapshot_type());
timeguard.click("decide_reserved_snapshot_");
@ -561,7 +566,8 @@ void ObMultiVersionGarbageCollector::decide_gc_status_(const ObMultiVersionGCSta
}
void ObMultiVersionGarbageCollector::decide_reserved_snapshot_version_(
const share::SCN reserved_snapshot)
const share::SCN reserved_snapshot,
const ObMultiVersionSnapshotType reserved_type)
{
int ret = OB_SUCCESS;
@ -575,12 +581,26 @@ void ObMultiVersionGarbageCollector::decide_reserved_snapshot_version_(
// We ignore the reserved snapshot with too late snapshot and report WARN
// because there may be servers offline and online suddenly and report a
// stale txn version. And we report error for a too too old snapshot.
if ((global_reserved_snapshot_.get_val_for_tx() -
reserved_snapshot.get_val_for_tx()) / 1000 > 100 * 1_min) {
// NB: There may be WRS service which disables the monotonic weak read and
// finally causes the timestamp to go back, so we should ignore it.
if (ObMultiVersionSnapshotType::MIN_UNALLOCATED_WRS == reserved_type
&& !transaction::ObWeakReadUtil::enable_monotonic_weak_read(MTL_ID())) {
MVCC_LOG(WARN, "update a smaller reserved snapshot with wrs disable monotonic weak read",
K(ret), KPC(this), K(global_reserved_snapshot_), K(reserved_snapshot));
} else if (ObMultiVersionSnapshotType::MIN_UNALLOCATED_WRS == reserved_type
&& ((global_reserved_snapshot_.get_val_for_tx() -
reserved_snapshot.get_val_for_tx()) / 1000 >
MAX(transaction::ObWeakReadUtil::max_stale_time_for_weak_consistency(MTL_ID()),
100 * 1_min))) {
MVCC_LOG(ERROR, "update a too too smaller reserved snapshot with wrs!!!",
K(ret), KPC(this), K(global_reserved_snapshot_), K(reserved_snapshot),
K(transaction::ObWeakReadUtil::max_stale_time_for_weak_consistency(MTL_ID())));
} else if ((global_reserved_snapshot_.get_val_for_tx() -
reserved_snapshot.get_val_for_tx()) / 1000 > 100 * 1_min) {
MVCC_LOG(ERROR, "update a too too smaller reserved snapshot!!!", K(ret), KPC(this),
K(global_reserved_snapshot_), K(reserved_snapshot));
} else {
MVCC_LOG(WARN, "update a too too smaller reserved snapshot!", K(ret), KPC(this),
MVCC_LOG(WARN, "update a too smaller reserved snapshot!", K(ret), KPC(this),
K(global_reserved_snapshot_), K(reserved_snapshot));
}
} else {
@ -1131,7 +1151,9 @@ int ObMultiVersionGarbageCollector::is_disk_almost_full_(bool &is_almost_full)
is_almost_full = false;
const int64_t required_size = 0;
if (OB_FAIL(THE_IO_DEVICE->check_space_full(required_size))) {
// Case1: io device is almost full
if (!is_almost_full
&& OB_FAIL(THE_IO_DEVICE->check_space_full(required_size))) {
if (OB_SERVER_OUTOF_DISK_SPACE == ret) {
ret = OB_SUCCESS;
is_almost_full = true;
@ -1141,9 +1163,38 @@ int ObMultiVersionGarbageCollector::is_disk_almost_full_(bool &is_almost_full)
}
}
// Case2: sstable is overflow during merge
if (!is_almost_full
&& is_sstable_overflow_()) {
is_almost_full = true;
MVCC_LOG(WARN, "disk is almost full, we should give up", KPC(this));
}
return ret;
}
void ObMultiVersionGarbageCollector::report_sstable_overflow()
{
const int64_t current_timestamp = common::ObTimeUtility::current_time();
ATOMIC_STORE(&last_sstable_overflow_timestamp_, current_timestamp);
MVCC_LOG_RET(WARN, OB_SIZE_OVERFLOW, "sstable is alomost overflow, we should give up", KPC(this));
}
bool ObMultiVersionGarbageCollector::is_sstable_overflow_()
{
bool b_ret = false;
const int64_t current_timestamp = common::ObTimeUtility::current_time();
const int64_t last_sstable_overflow_timestamp = ATOMIC_LOAD(&last_sstable_overflow_timestamp_);
if (0 != last_sstable_overflow_timestamp
&& current_timestamp >= last_sstable_overflow_timestamp
// We currenly think that there may be a disk full problem if there exists
// an sstable overflow error within 5 minutes
&& current_timestamp - last_sstable_overflow_timestamp <= 5 * 1_min) {
b_ret = true;
}
return b_ret;
}
ObMultiVersionGCSnapshotCalculator::ObMultiVersionGCSnapshotCalculator()
: reserved_snapshot_version_(share::SCN::max_scn()),
reserved_snapshot_type_(ObMultiVersionSnapshotType::MIN_SNAPSHOT_TYPE),
@ -1211,6 +1262,11 @@ share::SCN ObMultiVersionGCSnapshotCalculator::get_reserved_snapshot_version() c
return reserved_snapshot_version_;
}
// Accessor for the snapshot type stored alongside the reserved snapshot
// version held by this calculator.
ObMultiVersionSnapshotType ObMultiVersionGCSnapshotCalculator::get_reserved_snapshot_type() const
{
  return this->reserved_snapshot_type_;
}
ObMultiVersionGCStatus ObMultiVersionGCSnapshotCalculator::get_status() const
{
return status_;

View File

@ -179,6 +179,7 @@ public:
const int64_t create_time,
const ObAddr addr);
share::SCN get_reserved_snapshot_version() const;
ObMultiVersionSnapshotType get_reserved_snapshot_type() const;
ObMultiVersionGCStatus get_status() const;
bool is_this_server_disabled() const
{ return is_this_server_disabled_; }
@ -312,6 +313,9 @@ public:
// get_reserved_snapshot_for_active_txn fetch the cached globally reserved
// snapshot if updated in time, otherwise max_scn() is used for available
share::SCN get_reserved_snapshot_for_active_txn() const;
// report_sstable_overflow marks the last sstable's overflow events and we
// will use it to disable mvcc gc
void report_sstable_overflow();
// is_gc_disabled shows the global gc status of whether the gc is disabled
bool is_gc_disabled() const;
@ -319,6 +323,7 @@ public:
K_(last_study_timestamp),
K_(last_refresh_timestamp),
K_(last_reclaim_timestamp),
K_(last_sstable_overflow_timestamp),
K_(has_error_when_study),
K_(refresh_error_too_long),
K_(has_error_when_reclaim),
@ -342,8 +347,10 @@ private:
int study_max_committed_txn_version(share::SCN &max_committed_txn_version);
int study_min_active_txn_version(share::SCN &min_active_txn_version);
int is_disk_almost_full_(bool &is_almost_full);
bool is_sstable_overflow_();
void decide_gc_status_(const ObMultiVersionGCStatus gc_status);
void decide_reserved_snapshot_version_(const share::SCN reserved_snapshot);
void decide_reserved_snapshot_version_(const share::SCN reserved_snapshot,
const ObMultiVersionSnapshotType reserved_type);
// ============== for test ================
OB_NOINLINE bool can_report();
@ -355,6 +362,8 @@ private:
int64_t last_study_timestamp_;
int64_t last_refresh_timestamp_;
int64_t last_reclaim_timestamp_;
// last timestamp sstable reports overflow during merge
int64_t last_sstable_overflow_timestamp_;
bool has_error_when_study_;
// refresh too long without contacting inner table successfully.
// It may be caused by inner table majority crash or network issues.

View File

@ -23,6 +23,7 @@
#include "lib/container/ob_array_iterator.h"
#include "storage/meta_mem/ob_tablet_pointer.h"
#include "storage/ddl/ob_tablet_ddl_kv.h"
#include "storage/concurrency_control/ob_multi_version_garbage_collector.h"
using namespace oceanbase;
using namespace oceanbase::blocksstable;
@ -1238,9 +1239,11 @@ int ObTabletTableStore::check_ready_for_read()
} else if (minor_tables_.count() + 1 > MAX_SSTABLE_CNT_IN_STORAGE) {
ret = OB_SIZE_OVERFLOW;
LOG_WARN("Too Many sstables in table store", K(ret), KPC(this), KPC(tablet_ptr_));
MTL(concurrency_control::ObMultiVersionGarbageCollector *)->report_sstable_overflow();
} else if (get_table_count() > ObTabletTableStore::MAX_SSTABLE_CNT) {
ret = OB_SIZE_OVERFLOW;
LOG_WARN("Too Many sstables, cannot add another sstable any more", K(ret), KPC(this), KPC(tablet_ptr_));
MTL(concurrency_control::ObMultiVersionGarbageCollector *)->report_sstable_overflow();
ObPartitionMergePolicy::diagnose_table_count_unsafe(MAJOR_MERGE, *tablet_ptr_);
} else if (minor_tables_.empty()) {
is_ready_for_read_ = true;