[fix](transaction) Fix concurrent schema change and txn cause dead lock (#26428)
Concurrent schema change and txn may cause a deadlock. An example: Txn T is committed but not published; run schema change or rollup on T's related partition, which adds alter replica R; sc/rollup adds a sched txn watermark M; restart FE; after FE restarts, T's loadedTblIndexes is cleared because it is not saved to disk; T will publish version to all tablets, including sc/rollup's new alter replica R; since R does not contain the txn data, T will fail, and it will then wait forever for R's data; sc/rollup waits for the txns before M to finish, and only after that will it let R copy history data; since T is not finished, sc/rollup will always wait, so R will never copy history data; txn T and sc/rollup will wait for each other forever, causing a deadlock. Fix: because sc/rollup ensures double write after the sched watermark M, when finishing a transaction and checking an alter replica: if the txn id is bigger than M, check it just like a normal replica; otherwise skip checking this replica, since the BE will modify its history data later.
This commit is contained in:
@ -89,6 +89,11 @@ public abstract class AlterJobV2 implements Writable {
|
||||
@SerializedName(value = "rawSql")
|
||||
protected String rawSql;
|
||||
|
||||
// The job will wait all transactions before this txn id finished, then send the schema_change/rollup tasks.
|
||||
@SerializedName(value = "watershedTxnId")
|
||||
protected long watershedTxnId = -1;
|
||||
|
||||
|
||||
public AlterJobV2(String rawSql, long jobId, JobType jobType, long dbId, long tableId, String tableName,
|
||||
long timeoutMs) {
|
||||
this.rawSql = rawSql;
|
||||
@ -135,6 +140,10 @@ public abstract class AlterJobV2 implements Writable {
|
||||
return tableName;
|
||||
}
|
||||
|
||||
/**
 * Returns the watershed transaction id of this alter job.
 *
 * <p>Per the field's declaration, the job waits for all transactions before this txn id to finish
 * before sending its schema change / rollup tasks; the field is initialized to {@code -1}.
 *
 * @return the current value of {@code watershedTxnId}
 */
public long getWatershedTxnId() {
|
||||
return watershedTxnId;
|
||||
}
|
||||
|
||||
/**
 * Tells whether this job has run longer than its timeout.
 *
 * @return {@code true} if the time elapsed since {@code createTimeMs}, measured with
 *         {@link System#currentTimeMillis()}, is strictly greater than {@code timeoutMs}
 */
public boolean isTimeout() {
|
||||
return System.currentTimeMillis() - createTimeMs > timeoutMs;
|
||||
}
|
||||
|
||||
@ -134,10 +134,6 @@ public class RollupJobV2 extends AlterJobV2 implements GsonPostProcessable {
|
||||
@SerializedName(value = "storageFormat")
|
||||
private TStorageFormat storageFormat = TStorageFormat.DEFAULT;
|
||||
|
||||
// The rollup job will wait all transactions before this txn id finished, then send the rollup tasks.
|
||||
@SerializedName(value = "watershedTxnId")
|
||||
protected long watershedTxnId = -1;
|
||||
|
||||
// save all create rollup tasks
|
||||
private AgentBatchTask rollupBatchTask = new AgentBatchTask();
|
||||
// save failed task after retry three times, tabletId -> agentTask
|
||||
|
||||
@ -126,9 +126,6 @@ public class SchemaChangeJobV2 extends AlterJobV2 {
|
||||
@SerializedName(value = "indexes")
|
||||
private List<Index> indexes = null;
|
||||
|
||||
// The schema change job will wait all transactions before this txn id finished, then send the schema change tasks.
|
||||
@SerializedName(value = "watershedTxnId")
|
||||
protected long watershedTxnId = -1;
|
||||
@SerializedName(value = "storageFormat")
|
||||
private TStorageFormat storageFormat = TStorageFormat.DEFAULT;
|
||||
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
package org.apache.doris.transaction;
|
||||
|
||||
import org.apache.doris.alter.AlterJobV2;
|
||||
import org.apache.doris.catalog.Database;
|
||||
import org.apache.doris.catalog.DatabaseIf;
|
||||
import org.apache.doris.catalog.Env;
|
||||
@ -535,6 +536,7 @@ public class DatabaseTransactionMgr {
|
||||
transactionState.prolongPublishTimeout();
|
||||
}
|
||||
|
||||
// (TODO): ignore the alter index if txn id is less than sc sched watermark
|
||||
int loadRequiredReplicaNum = table.getLoadRequiredReplicaNum(partition.getId());
|
||||
for (MaterializedIndex index : allIndices) {
|
||||
for (Tablet tablet : index.getTablets()) {
|
||||
@ -553,6 +555,7 @@ public class DatabaseTransactionMgr {
|
||||
throw new TransactionCommitFailedException("could not find replica for tablet ["
|
||||
+ tabletId + "], backend [" + tabletBackend + "]");
|
||||
}
|
||||
|
||||
// if the tablet has no replicas to commit or the tablet is a rolling up tablet,
|
||||
// the commit backends maybe null
|
||||
// if the commit backends is null, set all replicas as error replicas
|
||||
@ -985,6 +988,7 @@ public class DatabaseTransactionMgr {
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean alterReplicaLoadedTxn = isAlterReplicaLoadedTxn(transactionId, table);
|
||||
Iterator<PartitionCommitInfo> partitionCommitInfoIterator
|
||||
= tableCommitInfo.getIdToPartitionCommitInfo().values().iterator();
|
||||
while (partitionCommitInfoIterator.hasNext()) {
|
||||
@ -1037,7 +1041,7 @@ public class DatabaseTransactionMgr {
|
||||
tabletWriteFailedReplicas.clear();
|
||||
tabletVersionFailedReplicas.clear();
|
||||
for (Replica replica : tablet.getReplicas()) {
|
||||
checkReplicaContinuousVersionSucc(tablet.getId(), replica,
|
||||
checkReplicaContinuousVersionSucc(tablet.getId(), replica, alterReplicaLoadedTxn,
|
||||
partitionCommitInfo.getVersion(), publishTasks.get(replica.getBackendId()),
|
||||
errorReplicaIds, tabletSuccReplicas, tabletWriteFailedReplicas,
|
||||
tabletVersionFailedReplicas);
|
||||
@ -1132,8 +1136,24 @@ public class DatabaseTransactionMgr {
|
||||
LOG.info("finish transaction {} successfully, publish result: {}", transactionState, publishResult.name());
|
||||
}
|
||||
|
||||
private void checkReplicaContinuousVersionSucc(long tabletId, Replica replica, long version,
|
||||
PublishVersionTask backendPublishTask, Set<Long> errorReplicaIds, List<Replica> tabletSuccReplicas,
|
||||
/**
 * Decides whether the alter replicas of {@code table} are expected to have loaded the given transaction.
 *
 * <p>If the table is in SCHEMA_CHANGE or ROLLUP state, the unfinished alter jobs of the table are
 * fetched from the matching handler, and the transaction counts as loaded only when its id is
 * strictly greater than the watershed txn id of every such job. For any other table state the
 * method returns {@code true} without looking at alter jobs.
 *
 * @param transactionId id of the transaction being finished
 * @param table the table whose state and unfinished alter jobs are inspected
 * @return {@code true} if the table is not under schema change / rollup, or if
 *         {@code transactionId} is greater than the watershed txn id of all unfinished alter jobs
 *         (this includes the case where the returned job list is empty)
 */
private boolean isAlterReplicaLoadedTxn(long transactionId, OlapTable table) {
|
||||
List<AlterJobV2> unfinishedAlterJobs = null;
|
||||
// Pick the handler that owns the unfinished jobs for the table's current alter state.
if (table.getState() == OlapTable.OlapTableState.SCHEMA_CHANGE) {
|
||||
unfinishedAlterJobs = Env.getCurrentEnv().getAlterInstance().getSchemaChangeHandler()
|
||||
.getUnfinishedAlterJobV2ByTableId(table.getId());
|
||||
} else if (table.getState() == OlapTable.OlapTableState.ROLLUP) {
|
||||
unfinishedAlterJobs = Env.getCurrentEnv().getAlterInstance().getMaterializedViewHandler()
|
||||
.getUnfinishedAlterJobV2ByTableId(table.getId());
|
||||
} else {
|
||||
// Table is neither in SCHEMA_CHANGE nor ROLLUP state: no watermark comparison is made.
return true;
|
||||
}
|
||||
|
||||
// Strict comparison: a txn whose id equals a job's watershed txn id is treated as not loaded.
return unfinishedAlterJobs.stream().allMatch(job -> transactionId > job.getWatershedTxnId());
|
||||
}
|
||||
|
||||
private void checkReplicaContinuousVersionSucc(long tabletId, Replica replica, boolean alterReplicaLoadedTxn,
|
||||
long version, PublishVersionTask backendPublishTask,
|
||||
Set<Long> errorReplicaIds, List<Replica> tabletSuccReplicas,
|
||||
List<Replica> tabletWriteFailedReplicas, List<Replica> tabletVersionFailedReplicas) {
|
||||
if (backendPublishTask == null || !backendPublishTask.isFinished()) {
|
||||
errorReplicaIds.add(replica.getId());
|
||||
@ -1155,6 +1175,17 @@ public class DatabaseTransactionMgr {
|
||||
}
|
||||
}
|
||||
|
||||
// Schema change and rollup has a sched watermark,
|
||||
// which ensures that alter replicas will load those txns whose txn id > sched watermark.
|
||||
// But for txns before the sched watermark, the alter replicas may not load the txns,
|
||||
// publish will ignore checking them and treat them as success in advance.
|
||||
// Later, BE will fill the alter replicas' history data from before the sched watermark.
|
||||
// If it fails to fill, FE will set the alter replica as bad.
|
||||
if (replica.getState() == Replica.ReplicaState.ALTER
|
||||
&& (!alterReplicaLoadedTxn || !Config.publish_version_check_alter_replica)) {
|
||||
errorReplicaIds.remove(replica.getId());
|
||||
}
|
||||
|
||||
if (!errorReplicaIds.contains(replica.getId())) {
|
||||
if (replica.checkVersionCatchUp(version - 1, true)) {
|
||||
tabletSuccReplicas.add(replica);
|
||||
|
||||
Reference in New Issue
Block a user