diff --git a/docs/en/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md b/docs/en/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md index 737c117a37..ed8400f152 100644 --- a/docs/en/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md +++ b/docs/en/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md @@ -34,7 +34,7 @@ ADMIN SET REPLICA STATUS This statement is used to set the state of the specified replica. -This command is currently only used to manually set the status of certain replicas to BAD or OK, allowing the system to automatically repair these replicas +This command is currently only used to manually set the status of certain replicas to BAD, DROP or OK, allowing the system to automatically repair these replicas grammar: @@ -47,13 +47,15 @@ ADMIN SET REPLICA STATUS 1. "tablet_id": Required. Specify a Tablet Id. 2. "backend_id": Required. Specify Backend Id. -3. "status": Required. Specifies the state. Currently only "bad" or "ok" are supported +3. "status": Required. Specifies the state. Currently only "bad", "drop" or "ok" are supported If the specified replica does not exist, or the status is already bad, it will be ignored. > Note: > -> The copy set to Bad status may be deleted immediately, please proceed with caution. +> A replica set to Bad can no longer be read or written. In addition, the setting may sometimes not take effect: if the replica's data is actually intact, then once the BE reports the tablet as ok, the FE will automatically change its status back to ok. This operation may delete the replica immediately, so please proceed with caution. +> +> A replica set to Drop can still be read and written. A healthy replica is first added on another machine, and only then is this replica deleted. Compared with setting Bad, setting Drop is safer. 
### Example @@ -63,7 +65,14 @@ If the specified replica does not exist, or the status is already bad, it will b ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "bad"); ```` -2. Set the replica status of tablet 10003 on BE 10001 to ok. + 2. Set the replica status of tablet 10003 on BE 10001 to drop. + + ```sql + ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "drop"); + ```` + + + 3. Set the replica status of tablet 10003 on BE 10001 to ok. ```sql ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "ok"); diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md b/docs/zh-CN/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md index 2ca2759433..d3094e1db2 100644 --- a/docs/zh-CN/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md +++ b/docs/zh-CN/docs/sql-manual/sql-reference/Database-Administration-Statements/ADMIN-SET-REPLICA-STATUS.md @@ -34,7 +34,7 @@ ADMIN SET REPLICA STATUS 该语句用于设置指定副本的状态。 -该命令目前仅用于手动将某些副本状态设置为 BAD 或 OK,从而使得系统能够自动修复这些副本 +该命令目前仅用于手动将某些副本状态设置为 BAD 、DROP 和 OK,从而使得系统能够自动修复这些副本 语法: @@ -47,13 +47,15 @@ ADMIN SET REPLICA STATUS 1. "tablet_id":必需。指定一个 Tablet Id. 2. "backend_id":必需。指定 Backend Id. -3. "status":必需。指定状态。当前仅支持 "bad" 或 "ok" +3. "status":必需。指定状态。当前仅支持 "drop"、"bad"、 "ok" 如果指定的副本不存在,或状态已经是 bad,则会被忽略。 > 注意: > -> 设置为 Bad 状态的副本可能立刻被删除,请谨慎操作。 +> 设置为 Bad 状态的副本,它将不能读写。另外,设置 Bad 有时是不生效的。如果该副本实际数据是正确的,当 BE 上报该副本状态是 ok 的,fe 将把副本自动恢复回ok状态。操作可能立刻删除该副本,请谨慎操作。 +> +> 设置为 Drop 状态的副本,它仍然可以读写。会在其他机器先增加一个健康副本,再删除该副本。相比设置Bad, 设置Drop的操作是安全的。 ### Example @@ -63,7 +65,13 @@ ADMIN SET REPLICA STATUS ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "bad"); ``` -2. 设置 tablet 10003 在 BE 10001 上的副本状态为 ok。 + 2. 
设置 tablet 10003 在 BE 10001 上的副本状态为 drop。 + +```sql +ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "drop"); +``` + + 3. 设置 tablet 10003 在 BE 10001 上的副本状态为 ok。 ```sql ADMIN SET REPLICA STATUS PROPERTIES("tablet_id" = "10003", "backend_id" = "10001", "status" = "ok"); diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 27065b1393..fe74d647d3 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1064,6 +1064,11 @@ public class Config extends ConfigBase { @ConfField(masterOnly = true, mutable = true) public static int balance_slot_num_per_path = 1; + // When "admin set replica status" is executed with status = 'drop', the replica is marked as user drop. + // The mark expires after manual_drop_replica_valid_second seconds, so the drop must happen within that time. + @ConfField(masterOnly = true, mutable = true) + public static long manual_drop_replica_valid_second = 24 * 3600L; + // This threshold is to avoid piling up too many report task in FE, which may cause OOM exception. // In some large Doris cluster, eg: 100 Backends with ten million replicas, a tablet report may cost // several seconds after some modification of metadata(drop partition, etc..). 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AdminSetReplicaStatusStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AdminSetReplicaStatusStmt.java index dfb6b38bed..84288dabd6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AdminSetReplicaStatusStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AdminSetReplicaStatusStmt.java @@ -33,7 +33,7 @@ import java.util.Map; * Required: * "tablet_id" = "10010", * "backend_id" = "10001" - * "status" = "bad"/"ok" + * "status" = "drop"/"bad"/"ok" */ public class AdminSetReplicaStatusStmt extends DdlStmt { @@ -81,7 +81,7 @@ public class AdminSetReplicaStatusStmt extends DdlStmt { } } else if (key.equalsIgnoreCase(STATUS)) { status = ReplicaStatus.valueOf(val.toUpperCase()); - if (status != ReplicaStatus.BAD && status != ReplicaStatus.OK) { + if (status != ReplicaStatus.BAD && status != ReplicaStatus.OK && status != ReplicaStatus.DROP) { throw new AnalysisException("Do not support setting replica status as " + val); } } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowReplicaStatusStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowReplicaStatusStmt.java index 844f979792..2a1f96fed9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowReplicaStatusStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowReplicaStatusStmt.java @@ -40,7 +40,7 @@ public class ShowReplicaStatusStmt extends ShowStmt { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("TabletId").add("ReplicaId").add("BackendId").add("Version").add("LastFailedVersion") .add("LastSuccessVersion").add("CommittedVersion").add("SchemaHash").add("VersionNum") - .add("IsBad").add("State").add("Status") + .add("IsBad").add("IsUserDrop").add("State").add("Status") .build(); private TableRef tblRef; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index ac056cd4b8..730535813e 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -5570,6 +5570,7 @@ public class Env { throw new MetaNotFoundException("replica does not exist on backend, beId=" + backendId); } if (status == ReplicaStatus.BAD || status == ReplicaStatus.OK) { + replica.setUserDrop(false); if (replica.setBad(status == ReplicaStatus.BAD)) { if (!isReplay) { SetReplicaStatusOperationLog log = new SetReplicaStatusOperationLog(backendId, tabletId, @@ -5579,6 +5580,10 @@ public class Env { LOG.info("set replica {} of tablet {} on backend {} as {}. is replay: {}", replica.getId(), tabletId, backendId, status, isReplay); } + } else if (status == ReplicaStatus.DROP) { + replica.setUserDrop(true); + LOG.info("set replica {} of tablet {} on backend {} as {}.", replica.getId(), + tabletId, backendId, status); } } finally { table.writeUnlock(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/MetadataViewer.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/MetadataViewer.java index 6805333cc2..33f0d9f9a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/MetadataViewer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/MetadataViewer.java @@ -93,6 +93,8 @@ public class MetadataViewer { } else if (replica.getSchemaHash() != -1 && replica.getSchemaHash() != schemaHash) { status = ReplicaStatus.SCHEMA_ERROR; + } else if (replica.isUserDrop()) { + status = ReplicaStatus.DROP; } if (filterReplica(status, statusFilter, op)) { @@ -109,6 +111,7 @@ public class MetadataViewer { row.add(String.valueOf(replica.getSchemaHash())); row.add(String.valueOf(replica.getVersionCount())); row.add(String.valueOf(replica.isBad())); + row.add(String.valueOf(replica.isUserDrop())); row.add(replica.getState().name()); row.add(status.name()); result.add(row); @@ -131,6 +134,7 @@ public class 
MetadataViewer { row.add("-1"); row.add(FeConstants.null_string); row.add(FeConstants.null_string); + row.add(FeConstants.null_string); row.add(ReplicaStatus.MISSING.name()); result.add(row); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Replica.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Replica.java index 66d84d117f..8d54d31a3f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Replica.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Replica.java @@ -70,7 +70,8 @@ public class Replica implements Writable { VERSION_ERROR, // missing version MISSING, // replica does not exist SCHEMA_ERROR, // replica's schema hash does not equal to index's schema hash - BAD // replica is broken. + BAD, // replica is broken. + DROP, // user force drop replica on this backend } @SerializedName(value = "id") @@ -159,6 +160,8 @@ public class Replica implements Writable { private long preWatermarkTxnId = -1; private long postWatermarkTxnId = -1; + private long userDropTime = -1; + public Replica() { } @@ -760,9 +763,33 @@ public class Replica implements Writable { return postWatermarkTxnId; } + public void setUserDrop(boolean isDrop) { + if (isDrop) { + userDropTime = System.currentTimeMillis(); + } else { + userDropTime = -1; + } + } + public boolean isAlive() { return getState() != ReplicaState.CLONE && getState() != ReplicaState.DECOMMISSION && !isBad(); } + + public boolean isUserDrop() { + if (userDropTime > 0) { + if (System.currentTimeMillis() - userDropTime < Config.manual_drop_replica_valid_second * 1000L) { + return true; + } + userDropTime = -1; + } + + return false; + } + + public boolean isScheduleAvailable() { + return Env.getCurrentSystemInfo().checkBackendScheduleAvailable(backendId) + && !isUserDrop(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java index 3ec3bbeef5..708334bb35 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java @@ -498,7 +498,7 @@ public class Tablet extends MetaObject implements Writable { aliveAndVersionComplete++; } - if (backend.isScheduleAvailable()) { + if (replica.isScheduleAvailable()) { if (replica.needFurtherRepair() && (needFurtherRepairReplica == null || !versionCompleted)) { needFurtherRepairReplica = replica; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java index e37ba3315d..67612c037b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java @@ -690,10 +690,15 @@ public class TabletSchedCtx implements Comparable { continue; } - Backend be = infoService.getBackend(replica.getBackendId()); - if (be == null || !be.isScheduleAvailable()) { - LOG.debug("replica's backend {} does not exist or is not scheduler available, skip. tablet: {}", - replica.getBackendId(), tabletId); + if (!replica.isScheduleAvailable()) { + // a failed backend check means the backend is the cause; otherwise the user marked the replica as drop + if (!Env.getCurrentSystemInfo().checkBackendScheduleAvailable(replica.getBackendId())) { + LOG.debug("replica's backend {} does not exist or is not scheduler available, skip. tablet: {}", + replica.getBackendId(), tabletId); + } else { + LOG.debug("user drop replica {}, skip. 
tablet: {}", + replica, tabletId); + } continue; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java index 59c62f7190..3f09a65b45 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java @@ -734,7 +734,7 @@ public class TabletScheduler extends MasterDaemon { Map currentAllocMap = Maps.newHashMap(); for (Replica replica : replicas) { Backend be = infoService.getBackend(replica.getBackendId()); - if (be != null && be.isScheduleAvailable() && replica.isAlive() && !replica.tooSlow() + if (be != null && replica.isScheduleAvailable() && replica.isAlive() && !replica.tooSlow() && be.isMixNode()) { Short num = currentAllocMap.getOrDefault(be.getLocationTag(), (short) 0); currentAllocMap.put(be.getLocationTag(), (short) (num + 1)); @@ -888,8 +888,10 @@ public class TabletScheduler extends MasterDaemon { // this case should be handled in deleteBackendDropped() continue; } - if (!be.isScheduleAvailable()) { - deleteReplicaInternal(tabletCtx, replica, "backend unavailable", force); + if (!replica.isScheduleAvailable()) { + // if the backend itself is still schedulable, the replica was marked as drop by the user + String reason = be.isScheduleAvailable() ? 
"user drop replica" : "backend unavailable"; + deleteReplicaInternal(tabletCtx, replica, reason, force); return true; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/ReplicasProcNode.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/ReplicasProcNode.java index 383ba14ab0..56ebbd3d83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/ReplicasProcNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/ReplicasProcNode.java @@ -42,6 +42,7 @@ public class ReplicasProcNode implements ProcNodeInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder().add("ReplicaId") .add("BackendId").add("Version").add("LstSuccessVersion").add("LstFailedVersion").add("LstFailedTime") .add("SchemaHash").add("LocalDataSize").add("RemoteDataSize").add("RowCount").add("State").add("IsBad") + .add("IsUserDrop") .add("VersionCount").add("PathHash").add("MetaUrl").add("CompactionStatus").add("CooldownReplicaId") .add("CooldownMetaId").add("QueryHits").build(); @@ -103,6 +104,7 @@ public class ReplicasProcNode implements ProcNodeInterface { String.valueOf(replica.getRowCount()), String.valueOf(replica.getState()), String.valueOf(replica.isBad()), + String.valueOf(replica.isUserDrop()), String.valueOf(replica.getVersionCount()), String.valueOf(replica.getPathHash()), metaUrl, diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java index fc0cfe08ea..2b084fcfa2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java @@ -149,9 +149,10 @@ public class Diagnoser { + replica.getLastFailedVersion()); } // status - if (!replica.isAlive()) { + if (!replica.isAlive() || replica.isUserDrop()) { statusErr.append("Replica on backend " + replica.getBackendId() + "'s state is " + replica.getState() - + ", and is bad: " + (replica.isBad() ? 
"Yes" : "No")); + + ", and is bad: " + (replica.isBad() ? "Yes" : "No") + + ", and is going to drop: " + (replica.isUserDrop() ? "Yes" : "No")); } if (replica.getVersionCount() > Config.min_version_count_indicate_replica_compaction_too_slow) { compactionErr.append("Replica on backend " + replica.getBackendId() + "'s version count is too high: " diff --git a/regression-test/suites/tablet_scheduler_p2/test_set_replica_drop.groovy b/regression-test/suites/tablet_scheduler_p2/test_set_replica_drop.groovy new file mode 100644 index 0000000000..7460a82cdf --- /dev/null +++ b/regression-test/suites/tablet_scheduler_p2/test_set_replica_drop.groovy @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite('test_set_replica_drop') { + def forceReplicaNum = getFeConfig('force_olap_table_replication_num') as int + if (forceReplicaNum > 0 && forceReplicaNum != 1) { + return + } + + def backends = sql 'show backends' + if (backends.size() < 2) { + return + } + + def config = [ + disable_balance : true, + schedule_slot_num_per_ssd_path : 1000, + schedule_slot_num_per_hdd_path : 1000, + schedule_batch_size: 1000, + ] + + setFeConfigTemporary(config) { + def tbl = 'test_set_replica_drop' + sql "DROP TABLE IF EXISTS ${tbl} FORCE" + sql "CREATE TABLE ${tbl} (k int) DISTRIBUTED BY HASH(k) BUCKETS 1 PROPERTIES ('replication_num' = '1')" + sql "INSERT INTO ${tbl} VALUES (1), (2), (3)" + sql "SELECT * FROM ${tbl}" + + def tablets = sql_return_maparray "SHOW TABLETS FROM ${tbl}" + assertEquals(1, tablets.size()) + def oldTablet = tablets[0] + + sql """ + ADMIN SET REPLICA STATUS PROPERTIES( + 'tablet_id' = '${oldTablet.TabletId}', + 'backend_id' = '${oldTablet.BackendId}', + 'status' = 'drop' + )""" + + def maxWaitSeconds = 300 + for (def i = 0; i < maxWaitSeconds; i++) { + sql "SELECT * FROM ${tbl}" + def ok = true + def end = i == maxWaitSeconds - 1 + tablets = sql_return_maparray "SHOW TABLETS FROM ${tbl}" + if (tablets.size() != 1) { + ok = false + if (end) { + assertEquals(1, tablets.size()) + } + } else { + def newTablet = tablets[0] + if (newTablet.BackendId == oldTablet.BackendId) { + ok = false + if (end) { + assertTrue(newTablet.BackendId != oldTablet.BackendId) + } + } + } + + if (ok) { + break + } else { + sleep 1000 + } + } + + sql "DROP TABLE IF EXISTS ${tbl} FORCE" + } +}