[fix](transaction) Fix concurrent schema change and txn cause dead lock (#26428)

A concurrent schema change and transaction may cause a deadlock. An example:

Txn T is committed but not yet published;
Run a schema change or rollup on T's related partition, which adds an alter replica R;
The sc/rollup job sets a sched txn watermark M;
Restart the FE;
After the FE restarts, T's loadedTblIndexes is cleared because it is not saved to disk;
T will publish its version to all tablets, including the sc/rollup's new alter replica R;
Since R does not contain the txn data, T will fail to publish. It will then wait forever for R's data;
The sc/rollup waits for all txns before M to finish; only after that will it let R copy the history data;
Since T has not finished, the sc/rollup will keep waiting, so R will never copy the history data;
Txn T and the sc/rollup will wait for each other forever, causing a deadlock;
Fix: because the sc/rollup ensures double write after the sched watermark M, when finishing a transaction and checking an alter replica:

if the txn id is bigger than M, check it just like a normal replica;
otherwise skip checking this replica; the BE will fill in its history data later.
This commit is contained in:
yujun
2023-11-13 21:39:28 +08:00
committed by GitHub
parent 7b50a62f0c
commit ebc15fc6cc
12 changed files with 306 additions and 22 deletions

View File

@ -89,6 +89,11 @@ public abstract class AlterJobV2 implements Writable {
@SerializedName(value = "rawSql")
protected String rawSql;
// The job will wait all transactions before this txn id finished, then send the schema_change/rollup tasks.
@SerializedName(value = "watershedTxnId")
protected long watershedTxnId = -1;
public AlterJobV2(String rawSql, long jobId, JobType jobType, long dbId, long tableId, String tableName,
long timeoutMs) {
this.rawSql = rawSql;
@ -135,6 +140,10 @@ public abstract class AlterJobV2 implements Writable {
return tableName;
}
/**
 * Returns the watershed transaction id of this alter job.
 *
 * <p>The job waits for all transactions before this txn id to finish before it sends
 * the schema_change/rollup tasks. The field is initialized to {@code -1}.
 *
 * @return the watershed txn id currently stored on this job
 */
public long getWatershedTxnId() {
    return this.watershedTxnId;
}
/**
 * Tells whether this job has been alive for longer than its timeout.
 *
 * @return {@code true} when the wall-clock time elapsed since {@code createTimeMs}
 *         is strictly greater than {@code timeoutMs}
 */
public boolean isTimeout() {
    // Elapsed time is measured against the current wall clock, in milliseconds.
    final long elapsedMs = System.currentTimeMillis() - createTimeMs;
    return elapsedMs > timeoutMs;
}

View File

@ -134,10 +134,6 @@ public class RollupJobV2 extends AlterJobV2 implements GsonPostProcessable {
@SerializedName(value = "storageFormat")
private TStorageFormat storageFormat = TStorageFormat.DEFAULT;
// The rollup job will wait all transactions before this txn id finished, then send the rollup tasks.
@SerializedName(value = "watershedTxnId")
protected long watershedTxnId = -1;
// save all create rollup tasks
private AgentBatchTask rollupBatchTask = new AgentBatchTask();
// save failed task after retry three times, tabletId -> agentTask

View File

@ -126,9 +126,6 @@ public class SchemaChangeJobV2 extends AlterJobV2 {
@SerializedName(value = "indexes")
private List<Index> indexes = null;
// The schema change job will wait all transactions before this txn id finished, then send the schema change tasks.
@SerializedName(value = "watershedTxnId")
protected long watershedTxnId = -1;
@SerializedName(value = "storageFormat")
private TStorageFormat storageFormat = TStorageFormat.DEFAULT;

View File

@ -17,6 +17,7 @@
package org.apache.doris.transaction;
import org.apache.doris.alter.AlterJobV2;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
@ -535,6 +536,7 @@ public class DatabaseTransactionMgr {
transactionState.prolongPublishTimeout();
}
// (TODO): ignore the alter index if txn id is less than sc sched watermark
int loadRequiredReplicaNum = table.getLoadRequiredReplicaNum(partition.getId());
for (MaterializedIndex index : allIndices) {
for (Tablet tablet : index.getTablets()) {
@ -553,6 +555,7 @@ public class DatabaseTransactionMgr {
throw new TransactionCommitFailedException("could not find replica for tablet ["
+ tabletId + "], backend [" + tabletBackend + "]");
}
// if the tablet have no replica's to commit or the tablet is a rolling up tablet,
// the commit backends maybe null
// if the commit backends is null, set all replicas as error replicas
@ -985,6 +988,7 @@ public class DatabaseTransactionMgr {
continue;
}
boolean alterReplicaLoadedTxn = isAlterReplicaLoadedTxn(transactionId, table);
Iterator<PartitionCommitInfo> partitionCommitInfoIterator
= tableCommitInfo.getIdToPartitionCommitInfo().values().iterator();
while (partitionCommitInfoIterator.hasNext()) {
@ -1037,7 +1041,7 @@ public class DatabaseTransactionMgr {
tabletWriteFailedReplicas.clear();
tabletVersionFailedReplicas.clear();
for (Replica replica : tablet.getReplicas()) {
checkReplicaContinuousVersionSucc(tablet.getId(), replica,
checkReplicaContinuousVersionSucc(tablet.getId(), replica, alterReplicaLoadedTxn,
partitionCommitInfo.getVersion(), publishTasks.get(replica.getBackendId()),
errorReplicaIds, tabletSuccReplicas, tabletWriteFailedReplicas,
tabletVersionFailedReplicas);
@ -1132,8 +1136,24 @@ public class DatabaseTransactionMgr {
LOG.info("finish transaction {} successfully, publish result: {}", transactionState, publishResult.name());
}
private void checkReplicaContinuousVersionSucc(long tabletId, Replica replica, long version,
PublishVersionTask backendPublishTask, Set<Long> errorReplicaIds, List<Replica> tabletSuccReplicas,
/**
 * Decides whether alter replicas of {@code table} are expected to have loaded the given txn.
 *
 * <p>When the table is neither in SCHEMA_CHANGE nor in ROLLUP state, this returns {@code true}.
 * Otherwise the unfinished alter jobs of the table are fetched from the matching handler, and
 * the result is {@code true} only if the txn id is strictly greater than the watershed txn id
 * of every one of those jobs (an empty job list therefore yields {@code true}).
 *
 * @param transactionId id of the transaction being finished
 * @param table table whose state and unfinished alter jobs are inspected
 * @return {@code true} if no unfinished alter job has a watershed txn id at or after
 *         {@code transactionId}
 */
private boolean isAlterReplicaLoadedTxn(long transactionId, OlapTable table) {
    final List<AlterJobV2> pendingJobs;
    if (table.getState() == OlapTable.OlapTableState.SCHEMA_CHANGE) {
        pendingJobs = Env.getCurrentEnv()
                .getAlterInstance()
                .getSchemaChangeHandler()
                .getUnfinishedAlterJobV2ByTableId(table.getId());
    } else if (table.getState() == OlapTable.OlapTableState.ROLLUP) {
        pendingJobs = Env.getCurrentEnv()
                .getAlterInstance()
                .getMaterializedViewHandler()
                .getUnfinishedAlterJobV2ByTableId(table.getId());
    } else {
        // No schema change or rollup in progress on this table.
        return true;
    }

    // A txn at or before any job's watershed is not guaranteed to be on the alter replicas.
    for (AlterJobV2 pendingJob : pendingJobs) {
        if (transactionId <= pendingJob.getWatershedTxnId()) {
            return false;
        }
    }
    return true;
}
private void checkReplicaContinuousVersionSucc(long tabletId, Replica replica, boolean alterReplicaLoadedTxn,
long version, PublishVersionTask backendPublishTask,
Set<Long> errorReplicaIds, List<Replica> tabletSuccReplicas,
List<Replica> tabletWriteFailedReplicas, List<Replica> tabletVersionFailedReplicas) {
if (backendPublishTask == null || !backendPublishTask.isFinished()) {
errorReplicaIds.add(replica.getId());
@ -1155,6 +1175,17 @@ public class DatabaseTransactionMgr {
}
}
// Schema change and rollup have a sched watermark,
// which ensures that alter replicas will load those txns whose txn id > sched watermark.
// But for txns before the sched watermark, the alter replicas may not load the txns,
// so publish will skip checking them and treat them as successful in advance.
// Later the BE will fill in the alter replicas' history data from before the sched watermark.
// If it fails to fill, the FE will set the alter replica bad.
if (replica.getState() == Replica.ReplicaState.ALTER
&& (!alterReplicaLoadedTxn || !Config.publish_version_check_alter_replica)) {
errorReplicaIds.remove(replica.getId());
}
if (!errorReplicaIds.contains(replica.getId())) {
if (replica.checkVersionCatchUp(version - 1, true)) {
tabletSuccReplicas.add(replica);