From a54cfb75588c4db5f5cccbed4464eaa1d4c5de59 Mon Sep 17 00:00:00 2001 From: xueweizhang Date: Thu, 16 Nov 2023 10:16:56 +0800 Subject: [PATCH] [fix](backup) return if status not ok and reduce submit job (#26940) When backup runs prepareAndSendSnapshotTask() and some table has an error, the status is set to not ok but the method does not return, so the other tables continue to put snapshot jobs into batchTask and the jobs are submitted to the backends even though these jobs need to be cancelled. So when the status is not ok, return and do not submit the jobs. --- .../org/apache/doris/backup/BackupJob.java | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java index 1d730db184..a91c2e12a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java @@ -376,17 +376,29 @@ public class BackupJob extends AbstractJob { switch (tbl.getType()) { case OLAP: OlapTable olapTable = (OlapTable) tbl; - checkOlapTable(olapTable, tableRef); - if (getContent() == BackupContent.ALL) { - prepareSnapshotTaskForOlapTableWithoutLock(db, (OlapTable) tbl, tableRef, batchTask); + if (!checkOlapTable(olapTable, tableRef).ok()) { + return; + } + if (getContent() == BackupContent.ALL) { + if (!prepareSnapshotTaskForOlapTableWithoutLock( + db, (OlapTable) tbl, tableRef, batchTask).ok()) { + return; + } + } + if (!prepareBackupMetaForOlapTableWithoutLock(tableRef, olapTable, copiedTables).ok()) { + return; } - prepareBackupMetaForOlapTableWithoutLock(tableRef, olapTable, copiedTables); break; case VIEW: - prepareBackupMetaForViewWithoutLock((View) tbl, copiedTables); + if (!prepareBackupMetaForViewWithoutLock((View) tbl, copiedTables).ok()) { + return; + } break; case ODBC: - prepareBackupMetaForOdbcTableWithoutLock((OdbcTable) tbl, copiedTables, copiedResources); + if (!prepareBackupMetaForOdbcTableWithoutLock( + (OdbcTable) 
tbl, copiedTables, copiedResources).ok()) { + return; + } break; default: status = new Status(ErrCode.COMMON_ERROR, @@ -413,7 +425,7 @@ public class BackupJob extends AbstractJob { LOG.info("finished to send snapshot tasks to backend. {}", this); } - private void checkOlapTable(OlapTable olapTable, TableRef backupTableRef) { + private Status checkOlapTable(OlapTable olapTable, TableRef backupTableRef) { olapTable.readLock(); try { // check backup table again @@ -423,16 +435,17 @@ public class BackupJob extends AbstractJob { if (partition == null) { status = new Status(ErrCode.NOT_FOUND, "partition " + partName + " does not exist in table" + backupTableRef.getName().getTbl()); - return; + return status; } } } } finally { olapTable.readUnlock(); } + return Status.OK; } - private void prepareSnapshotTaskForOlapTableWithoutLock(Database db, OlapTable olapTable, + private Status prepareSnapshotTaskForOlapTableWithoutLock(Database db, OlapTable olapTable, TableRef backupTableRef, AgentBatchTask batchTask) { // Add barrier editolog for barrier commit seq long dbId = db.getId(); @@ -452,7 +465,7 @@ public class BackupJob extends AbstractJob { if (partition == null) { status = new Status(ErrCode.NOT_FOUND, "partition " + partName + " does not exist in table" + backupTableRef.getName().getTbl()); - return; + return status; } } } @@ -481,7 +494,7 @@ public class BackupJob extends AbstractJob { status = new Status(ErrCode.COMMON_ERROR, "failed to choose replica to make snapshot for tablet " + tablet.getId() + ". 
visible version: " + visibleVersion); - return; + return status; } SnapshotTask task = new SnapshotTask(null, replica.getBackendId(), tablet.getId(), jobId, dbId, olapTable.getId(), partition.getId(), @@ -496,6 +509,7 @@ public class BackupJob extends AbstractJob { LOG.info("snapshot for partition {}, version: {}", partition.getId(), visibleVersion); } + return Status.OK; } private void checkResourceForOdbcTable(OdbcTable odbcTable) { @@ -511,7 +525,7 @@ public class BackupJob extends AbstractJob { } } - private void prepareBackupMetaForOlapTableWithoutLock(TableRef tableRef, OlapTable olapTable, + private Status prepareBackupMetaForOlapTableWithoutLock(TableRef tableRef, OlapTable olapTable, List copiedTables) { // only copy visible indexes List reservedPartitions = tableRef.getPartitionNames() == null ? null @@ -519,28 +533,30 @@ public class BackupJob extends AbstractJob { OlapTable copiedTbl = olapTable.selectiveCopy(reservedPartitions, IndexExtState.VISIBLE, true); if (copiedTbl == null) { status = new Status(ErrCode.COMMON_ERROR, "failed to copy table: " + olapTable.getName()); - return; + return status; } removeUnsupportProperties(copiedTbl); copiedTables.add(copiedTbl); + return Status.OK; } - private void prepareBackupMetaForViewWithoutLock(View view, List
copiedTables) { + private Status prepareBackupMetaForViewWithoutLock(View view, List
copiedTables) { View copiedView = view.clone(); if (copiedView == null) { status = new Status(ErrCode.COMMON_ERROR, "failed to copy view: " + view.getName()); - return; + return status; } copiedTables.add(copiedView); + return Status.OK; } - private void prepareBackupMetaForOdbcTableWithoutLock(OdbcTable odbcTable, List
copiedTables, + private Status prepareBackupMetaForOdbcTableWithoutLock(OdbcTable odbcTable, List
copiedTables, List copiedResources) { OdbcTable copiedOdbcTable = odbcTable.clone(); if (copiedOdbcTable == null) { status = new Status(ErrCode.COMMON_ERROR, "failed to copy odbc table: " + odbcTable.getName()); - return; + return status; } copiedTables.add(copiedOdbcTable); if (copiedOdbcTable.getOdbcCatalogResourceName() != null) { @@ -550,10 +566,11 @@ public class BackupJob extends AbstractJob { if (copiedResource == null) { status = new Status(ErrCode.COMMON_ERROR, "failed to copy odbc resource: " + resource.getName()); - return; + return status; } copiedResources.add(copiedResource); } + return Status.OK; } private void removeUnsupportProperties(OlapTable tbl) {