[fix](oom) avoid oom when a lot of tablets fail on load (#36944)
pick #36873
This commit is contained in:
@ -1159,6 +1159,45 @@ public class DatabaseTransactionMgr {
|
||||
return true;
|
||||
}
|
||||
|
||||
private class TabletsPublishResultLogs {
|
||||
public List<String> quorumSuccLogs = Lists.newArrayList();
|
||||
public List<String> timeoutSuccLogs = Lists.newArrayList();
|
||||
public List<String> failedLogs = Lists.newArrayList();
|
||||
|
||||
public void addQuorumSuccLog(String log) {
|
||||
if (quorumSuccLogs.size() < 16) {
|
||||
quorumSuccLogs.add(log);
|
||||
}
|
||||
}
|
||||
|
||||
public void addTimeoutSuccLog(String log) {
|
||||
if (timeoutSuccLogs.size() < 16) {
|
||||
timeoutSuccLogs.add(log);
|
||||
}
|
||||
}
|
||||
|
||||
public void addFailedLog(String log) {
|
||||
if (failedLogs.size() < 16) {
|
||||
failedLogs.add(log);
|
||||
}
|
||||
}
|
||||
|
||||
public void log() {
|
||||
// log failed logs
|
||||
for (String log : failedLogs) {
|
||||
LOG.info(log);
|
||||
}
|
||||
// log timeout succ logs
|
||||
for (String log : timeoutSuccLogs) {
|
||||
LOG.info(log);
|
||||
}
|
||||
// log quorum succ logs
|
||||
for (String log : quorumSuccLogs) {
|
||||
LOG.info(log);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private PublishResult finishCheckQuorumReplicas(TransactionState transactionState,
|
||||
List<Pair<OlapTable, Partition>> relatedTblPartitions,
|
||||
Set<Long> errorReplicaIds) {
|
||||
@ -1173,7 +1212,7 @@ public class DatabaseTransactionMgr {
|
||||
List<Replica> tabletSuccReplicas = Lists.newArrayList();
|
||||
List<Replica> tabletWriteFailedReplicas = Lists.newArrayList();
|
||||
List<Replica> tabletVersionFailedReplicas = Lists.newArrayList();
|
||||
List<String> logs = Lists.newArrayList();
|
||||
TabletsPublishResultLogs logs = new TabletsPublishResultLogs();
|
||||
|
||||
Map<Long, PublishVersionTask> publishTasks = transactionState.getPublishVersionTasks();
|
||||
PublishResult publishResult = PublishResult.QUORUM_SUCC;
|
||||
@ -1224,9 +1263,9 @@ public class DatabaseTransactionMgr {
|
||||
if (hasFailedReplica) {
|
||||
String writeDetail = getTabletWriteDetail(tabletSuccReplicas,
|
||||
tabletWriteFailedReplicas, tabletVersionFailedReplicas);
|
||||
logs.add(String.format("publish version quorum succ for transaction %s on tablet %s"
|
||||
+ " with version %s, and has failed replicas, load require replica num %s. "
|
||||
+ "table %s, partition: [ id=%s, commit version=%s ], tablet detail: %s",
|
||||
logs.addQuorumSuccLog(String.format("publish version quorum succ for transaction %s "
|
||||
+ "on tablet %s with version %s, and has failed replicas, load require replica "
|
||||
+ "num %s. table %s, partition: [ id=%s, commit version=%s ], tablet detail: %s",
|
||||
transactionState, tablet.getId(), newVersion, loadRequiredReplicaNum, tableId,
|
||||
partitionId, partition.getCommittedVersion(), writeDetail));
|
||||
}
|
||||
@ -1248,9 +1287,9 @@ public class DatabaseTransactionMgr {
|
||||
// that are being publised exists on a few replicas we should go
|
||||
// ahead, otherwise data may be lost and thre
|
||||
// publish task hangs forever.
|
||||
logs.add(String.format("publish version timeout succ for transaction %s on tablet %s "
|
||||
+ "with version %s, and has failed replicas, load require replica num %s. "
|
||||
+ "table %s, partition %s, tablet detail: %s",
|
||||
logs.addTimeoutSuccLog(String.format("publish version timeout succ for transaction %s "
|
||||
+ "on tablet %s with version %s, and has failed replicas, load require replica num %s."
|
||||
+ " table %s, partition %s, tablet detail: %s",
|
||||
transactionState, tablet.getId(), newVersion,
|
||||
loadRequiredReplicaNum, tableId, partitionId, writeDetail));
|
||||
} else {
|
||||
@ -1261,8 +1300,8 @@ public class DatabaseTransactionMgr {
|
||||
tablet.getId(), healthReplicaNum, loadRequiredReplicaNum, tableId,
|
||||
partitionId, newVersion);
|
||||
transactionState.setErrorMsg(errMsg);
|
||||
logs.add(String.format("publish version failed for transaction %s on tablet %s with version"
|
||||
+ " %s, and has failed replicas, load required replica num %s. table %s, "
|
||||
logs.addQuorumSuccLog(String.format("publish version failed for transaction %s on tablet %s "
|
||||
+ " with version %s, and has failed replicas, load required replica num %s. table %s, "
|
||||
+ "partition %s, tablet detail: %s",
|
||||
transactionState, tablet.getId(), newVersion,
|
||||
loadRequiredReplicaNum, tableId, partitionId, writeDetail));
|
||||
@ -1275,10 +1314,7 @@ public class DatabaseTransactionMgr {
|
||||
|| now - transactionState.getLastPublishLogTime() > Config.publish_fail_log_interval_second * 1000L;
|
||||
if (needLog) {
|
||||
transactionState.setLastPublishLogTime(now);
|
||||
for (String log : logs) {
|
||||
LOG.info("{}. publish times {}, whole txn publish result {}",
|
||||
log, transactionState.getPublishCount(), publishResult.name());
|
||||
}
|
||||
logs.log();
|
||||
}
|
||||
|
||||
return publishResult;
|
||||
|
||||
Reference in New Issue
Block a user