[improvement](transaction) make commit txn fail hint more understandable (#23227)
This commit is contained in:
@ -410,7 +410,8 @@ public class OlapTableSink extends DataSink {
|
||||
Multimap<Long, Long> bePathsMap = tablet.getNormalReplicaBackendPathMap();
|
||||
if (bePathsMap.keySet().size() < quorum) {
|
||||
throw new UserException(InternalErrorCode.REPLICA_FEW_ERR,
|
||||
"tablet " + tablet.getId() + " has few replicas: " + bePathsMap.keySet().size()
|
||||
"tablet " + tablet.getId() + " alive replica num " + bePathsMap.keySet().size()
|
||||
+ " < quorum replica num " + quorum
|
||||
+ ", alive backends: [" + StringUtils.join(bePathsMap.keySet(), ",") + "]");
|
||||
}
|
||||
|
||||
|
||||
@ -81,6 +81,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@ -434,6 +435,7 @@ public class DatabaseTransactionMgr {
|
||||
Set<Long> errorReplicaIds, Map<Long, Set<Long>> tableToPartition,
|
||||
Set<Long> totalInvolvedBackends) throws UserException {
|
||||
|
||||
long transactionId = transactionState.getTransactionId();
|
||||
Database db = env.getInternalCatalog().getDbOrMetaException(dbId);
|
||||
|
||||
// update transaction state extra if exists
|
||||
@ -490,6 +492,33 @@ public class DatabaseTransactionMgr {
|
||||
}
|
||||
tabletToBackends.get(tabletId).add(tabletCommitInfos.get(i).getBackendId());
|
||||
}
|
||||
List<String> tabletSuccReplicas = Lists.newArrayList();
|
||||
List<String> tabletWriteFailedReplicas = Lists.newArrayList();
|
||||
List<String> tabletVersionFailedReplicas = Lists.newArrayList();
|
||||
Function<Replica, String> getReplicaInfo = replica -> {
|
||||
StringBuilder strBuffer = new StringBuilder("[replicaId=");
|
||||
strBuffer.append(replica.getId());
|
||||
strBuffer.append(", backendId=");
|
||||
strBuffer.append(replica.getBackendId());
|
||||
strBuffer.append(", backendAlive=");
|
||||
strBuffer.append(Env.getCurrentSystemInfo().checkBackendAlive(replica.getBackendId()));
|
||||
strBuffer.append(", version=");
|
||||
strBuffer.append(replica.getVersion());
|
||||
if (replica.getLastFailedVersion() >= 0) {
|
||||
strBuffer.append(", lastFailedVersion=");
|
||||
strBuffer.append(replica.getLastFailedVersion());
|
||||
strBuffer.append(", lastSuccessVersion=");
|
||||
strBuffer.append(replica.getLastSuccessVersion());
|
||||
strBuffer.append(", lastFailedTimestamp=");
|
||||
strBuffer.append(replica.getLastFailedTimestamp());
|
||||
}
|
||||
strBuffer.append(", state=");
|
||||
strBuffer.append(replica.getState().name());
|
||||
strBuffer.append("]");
|
||||
|
||||
return strBuffer.toString();
|
||||
};
|
||||
|
||||
for (long tableId : tableToPartition.keySet()) {
|
||||
OlapTable table = (OlapTable) db.getTableOrMetaException(tableId);
|
||||
for (Partition partition : table.getAllPartitions()) {
|
||||
@ -533,6 +562,9 @@ public class DatabaseTransactionMgr {
|
||||
.getReplicaAllocation(partition.getId()).getTotalReplicaNum() / 2 + 1;
|
||||
for (MaterializedIndex index : allIndices) {
|
||||
for (Tablet tablet : index.getTablets()) {
|
||||
tabletSuccReplicas.clear();
|
||||
tabletWriteFailedReplicas.clear();
|
||||
tabletVersionFailedReplicas.clear();
|
||||
int successReplicaNum = 0;
|
||||
long tabletId = tablet.getId();
|
||||
Set<Long> tabletBackends = tablet.getBackendIds();
|
||||
@ -558,11 +590,14 @@ public class DatabaseTransactionMgr {
|
||||
// for example, a replica is in clone state
|
||||
if (replica.getLastFailedVersion() < 0) {
|
||||
++successReplicaNum;
|
||||
tabletSuccReplicas.add(getReplicaInfo.apply(replica));
|
||||
} else {
|
||||
errorReplicaInfo += " replica [" + replica.getId() + "], lastFailedVersion ["
|
||||
+ replica.getLastFailedVersion() + "]";
|
||||
tabletVersionFailedReplicas.add(getReplicaInfo.apply(replica));
|
||||
}
|
||||
} else {
|
||||
tabletWriteFailedReplicas.add(getReplicaInfo.apply(replica));
|
||||
errorBackendIdsForTablet.add(tabletBackend);
|
||||
errorReplicaIds.add(replica.getId());
|
||||
// not remove rollup task here, because the commit maybe failed
|
||||
@ -580,9 +615,29 @@ public class DatabaseTransactionMgr {
|
||||
transactionState.getTransactionId(), tablet.getId(), successReplicaNum,
|
||||
quorumReplicaNum, Joiner.on(",").join(errorBackendIdsForTablet),
|
||||
errorReplicaInfo, commitBackends);
|
||||
throw new TabletQuorumFailedException(transactionState.getTransactionId(), tablet.getId(),
|
||||
successReplicaNum, quorumReplicaNum,
|
||||
errorBackendIdsForTablet);
|
||||
|
||||
String replicasDetailMsg = "";
|
||||
if (!tabletSuccReplicas.isEmpty()) {
|
||||
replicasDetailMsg += String.format("%s replicas final succ: { %s }; ",
|
||||
tabletSuccReplicas.size(), Joiner.on(", ").join(tabletSuccReplicas));
|
||||
}
|
||||
if (!tabletWriteFailedReplicas.isEmpty()) {
|
||||
replicasDetailMsg += String.format("%s replicas write data failed: { %s }; ",
|
||||
tabletWriteFailedReplicas.size(),
|
||||
Joiner.on(", ").join(tabletWriteFailedReplicas));
|
||||
}
|
||||
if (!tabletVersionFailedReplicas.isEmpty()) {
|
||||
replicasDetailMsg += String.format("%s replicas write data succ but miss previous "
|
||||
+ "version: { %s }.",
|
||||
tabletVersionFailedReplicas.size(),
|
||||
Joiner.on(", ").join(tabletVersionFailedReplicas));
|
||||
}
|
||||
|
||||
throw new TabletQuorumFailedException(transactionId, String.format(
|
||||
"Failed to commit txn %s, cause tablet %s succ replica num %s < quorum "
|
||||
+ " replica num %s. table %s, partition %s, this tablet detail: %s",
|
||||
transactionId, tablet.getId(), successReplicaNum, quorumReplicaNum, tableId,
|
||||
partition.getId(), replicasDetailMsg));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,27 +17,8 @@
|
||||
|
||||
package org.apache.doris.transaction;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
public class TabletQuorumFailedException extends TransactionException {
|
||||
|
||||
private static final String TABLET_QUORUM_FAILED_MSG = "Failed to commit txn %s. "
|
||||
+ "Tablet [%s] success replica num %s is less than quorum "
|
||||
+ "replica num %s while error backends %s";
|
||||
|
||||
private long tabletId;
|
||||
private Set<Long> errorBackendIdsForTablet = Sets.newHashSet();
|
||||
|
||||
public TabletQuorumFailedException(long transactionId, long tabletId,
|
||||
int successReplicaNum, int quorumReplicaNum,
|
||||
Set<Long> errorBackendIdsForTablet) {
|
||||
super(String.format(TABLET_QUORUM_FAILED_MSG, transactionId, tabletId,
|
||||
successReplicaNum, quorumReplicaNum,
|
||||
Joiner.on(",").join(errorBackendIdsForTablet)), transactionId);
|
||||
this.tabletId = tabletId;
|
||||
this.errorBackendIdsForTablet = errorBackendIdsForTablet;
|
||||
public TabletQuorumFailedException(long transactionId, String message) {
|
||||
super(message, transactionId);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user