[improvement](tablet clone) improve tablet balance, scaling speed etc (#22317)

This commit is contained in:
yujun
2023-08-17 22:30:49 +08:00
committed by GitHub
parent 57568ba472
commit 1f19d0db3e
20 changed files with 640 additions and 201 deletions

View File

@ -0,0 +1,174 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.clone;
import org.apache.doris.analysis.AlterSystemStmt;
import org.apache.doris.analysis.CreateDbStmt;
import org.apache.doris.analysis.CreateTableStmt;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.common.Config;
import org.apache.doris.common.ExceptionChecker;
import org.apache.doris.common.FeConstants;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.system.Backend;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.thrift.TDisk;
import org.apache.doris.thrift.TStorageMedium;
import org.apache.doris.utframe.UtFrameUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
public class DecommissionTest {
    // FIX: logger previously referenced TabletReplicaTooSlowTest.class (copy-paste
    // from a sibling test); log lines were attributed to the wrong class.
    private static final Logger LOG = LogManager.getLogger(DecommissionTest.class);

    // use a unique dir so that it won't be conflict with other unit test which
    // may also start a Mocked Frontend
    private static String runningDirBase = "fe";
    private static String runningDir = runningDirBase + "/mocked/DecommissionTest/" + UUID.randomUUID() + "/";
    private static ConnectContext connectContext;
    private static Random random = new Random(System.currentTimeMillis());

    // NOTE(review): the three fields below are not referenced anywhere in this
    // test; they look like leftovers from the template test class. Kept to avoid
    // churn, but they are candidates for removal.
    private long id = 10086;
    private final SystemInfoService systemInfoService = new SystemInfoService();
    private final TabletInvertedIndex invertedIndex = new TabletInvertedIndex();

    /**
     * Boots a mocked 4-backend Doris cluster, gives every backend two HDD disks
     * with near-empty usage, and creates database "test".
     *
     * Scheduler-related configs are tuned so that tablet repair/balance happens
     * fast enough for a unit test (short checker intervals, huge slot/batch
     * limits, balance disabled so only decommission-driven moves occur).
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        // FIX: `FeConstants.runningUnitTest = true;` was assigned twice; the
        // duplicate has been removed.
        FeConstants.runningUnitTest = true;
        System.out.println(runningDir);
        FeConstants.tablet_checker_interval_ms = 200;
        FeConstants.tablet_schedule_interval_ms = 2000;
        Config.tablet_repair_delay_factor_second = 1;
        Config.enable_round_robin_create_tablet = true;
        Config.schedule_slot_num_per_hdd_path = 10000;
        Config.max_scheduling_tablets = 10000;
        Config.schedule_batch_size = 10000;
        Config.disable_balance = true;

        // 4 backends:
        // 127.0.0.1
        // 127.0.0.2
        // 127.0.0.3
        // 127.0.0.4
        UtFrameUtils.createDorisClusterWithMultiTag(runningDir, 4);

        // Report two HDD disks per backend so the tablet scheduler has real
        // path statistics (capacity/usage/path hash) to work with.
        List<Backend> backends = Env.getCurrentSystemInfo().getAllBackends();
        for (Backend be : backends) {
            Map<String, TDisk> backendDisks = Maps.newHashMap();
            TDisk tDisk1 = new TDisk();
            tDisk1.setRootPath("/home/doris1.HDD");
            tDisk1.setDiskTotalCapacity(20000000);
            tDisk1.setDataUsedCapacity(1);
            tDisk1.setUsed(true);
            tDisk1.setDiskAvailableCapacity(tDisk1.disk_total_capacity - tDisk1.data_used_capacity);
            tDisk1.setPathHash(random.nextLong());
            tDisk1.setStorageMedium(TStorageMedium.HDD);
            backendDisks.put(tDisk1.getRootPath(), tDisk1);

            TDisk tDisk2 = new TDisk();
            // NOTE(review): ".HHD" looks like a typo for ".HDD" — behavior is
            // unaffected because the medium is set explicitly below, but the
            // path name should probably be fixed for consistency.
            tDisk2.setRootPath("/home/doris2.HHD");
            tDisk2.setDiskTotalCapacity(20000000);
            tDisk2.setDataUsedCapacity(1);
            tDisk2.setUsed(true);
            tDisk2.setDiskAvailableCapacity(tDisk2.disk_total_capacity - tDisk2.data_used_capacity);
            tDisk2.setPathHash(random.nextLong());
            tDisk2.setStorageMedium(TStorageMedium.HDD);
            backendDisks.put(tDisk2.getRootPath(), tDisk2);

            be.updateDisks(backendDisks);
        }

        connectContext = UtFrameUtils.createDefaultCtx();

        // create database
        String createDbStmtStr = "create database test;";
        CreateDbStmt createDbStmt = (CreateDbStmt) UtFrameUtils.parseAndAnalyzeStmt(createDbStmtStr, connectContext);
        Env.getCurrentEnv().createDb(createDbStmt);
    }

    @AfterClass
    public static void tearDown() {
        // Intentionally left as a no-op: keeping the mocked FE dir around helps
        // debugging failed runs. Uncomment to clean up.
        //UtFrameUtils.cleanDorisFeDir(runningDirBase);
    }

    /** Creates a table from {@code sql} and backfills replica path hashes so the scheduler can place clones. */
    private static void createTable(String sql) throws Exception {
        CreateTableStmt createTableStmt = (CreateTableStmt) UtFrameUtils.parseAndAnalyzeStmt(sql, connectContext);
        Env.getCurrentEnv().createTable(createTableStmt);
        RebalancerTestUtil.updateReplicaPathHash();
    }

    /**
     * Creates a 2400-bucket single-replica table spread over 4 backends, then
     * decommissions one backend and verifies all 2400 replicas end up evenly
     * redistributed over the remaining 3 backends (800 each).
     */
    @Test
    public void testDecommissionBackend() throws Exception {
        // test colocate tablet repair
        String createStr = "create table test.tbl1\n"
                + "(k1 date, k2 int)\n"
                + "distributed by hash(k2) buckets 2400\n"
                + "properties\n"
                + "(\n"
                + " \"replication_num\" = \"1\"\n"
                + ")";
        ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));

        int totalReplicaNum = 1 * 2400;
        // Round-robin creation should already be perfectly balanced: one try.
        checkBalance(1, totalReplicaNum, 4);

        Backend backend = Env.getCurrentSystemInfo().getAllBackends().get(0);
        String decommissionStmtStr = "alter system decommission backend \"" + backend.getHost()
                + ":" + backend.getHeartbeatPort() + "\"";
        AlterSystemStmt decommissionStmt =
                (AlterSystemStmt) UtFrameUtils.parseAndAnalyzeStmt(decommissionStmtStr, connectContext);
        Env.getCurrentEnv().getAlterInstance().processAlterCluster(decommissionStmt);
        Assert.assertEquals(true, backend.isDecommissioned());

        // Migration takes time: poll up to 200 times (1s apart while the
        // decommissioned BE is still alive) for the 3-backend balance.
        checkBalance(200, totalReplicaNum, 3);
    }

    /**
     * Asserts that {@code totalReplicaNum} replicas are spread evenly across
     * exactly {@code backendNum} alive backends.
     *
     * Retries (sleeping 1s between tries) only while the alive-backend count
     * has not yet reached {@code backendNum}; once it matches — or on the last
     * try — the per-backend tablet counts are asserted immediately.
     *
     * @param tryTimes        max polling attempts
     * @param totalReplicaNum total replicas expected in the cluster
     * @param backendNum      expected number of alive backends
     */
    void checkBalance(int tryTimes, int totalReplicaNum, int backendNum) throws Exception {
        int beReplicaNum = totalReplicaNum / backendNum;
        for (int i = 0; i < tryTimes; i++) {
            List<Long> backendIds = Env.getCurrentSystemInfo().getAllBackendIds(true);
            if (backendNum != backendIds.size() && i != tryTimes - 1) {
                Thread.sleep(1000);
                continue;
            }

            List<Integer> tabletNums = Lists.newArrayList();
            for (long beId : backendIds) {
                tabletNums.add(Env.getCurrentInvertedIndex().getTabletNumByBackendId(beId));
            }

            Assert.assertEquals("tablet nums = " + tabletNums, backendNum, backendIds.size());
            for (int tabletNum : tabletNums) {
                Assert.assertEquals("tablet nums = " + tabletNums, beReplicaNum, tabletNum);
            }
        }
    }
}

View File

@ -20,7 +20,6 @@ package org.apache.doris.clone;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.DataProperty;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.HashDistributionInfo;
import org.apache.doris.catalog.KeysType;
@ -36,7 +35,6 @@ import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.datasource.InternalCatalog;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.Backend;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.task.AgentTask;
import org.apache.doris.task.StorageMediaMigrationTask;
@ -60,7 +58,6 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
public class DiskRebalanceTest {
@ -79,10 +76,12 @@ public class DiskRebalanceTest {
private final SystemInfoService systemInfoService = new SystemInfoService();
private final TabletInvertedIndex invertedIndex = new TabletInvertedIndex();
private Map<Tag, LoadStatisticForTag> statisticMap;
private Map<Long, PathSlot> backendsWorkingSlots = Maps.newHashMap();
@Before
public void setUp() throws Exception {
Config.used_capacity_percent_max_diff = 1.0;
Config.balance_slot_num_per_path = 1;
db = new Database(1, "test db");
db.setClusterName(SystemInfoService.DEFAULT_CLUSTER);
new Expectations() {
@ -137,12 +136,19 @@ public class DiskRebalanceTest {
Env.getCurrentGlobalTransactionMgr().isPreviousTransactionsFinished(1, 2, Lists.newArrayList(3L)));
}
private void generateStatisticMap() {
private void generateStatisticsAndPathSlots() {
LoadStatisticForTag loadStatistic = new LoadStatisticForTag(Tag.DEFAULT_BACKEND_TAG, systemInfoService,
invertedIndex);
loadStatistic.init();
statisticMap = Maps.newHashMap();
statisticMap.put(Tag.DEFAULT_BACKEND_TAG, loadStatistic);
backendsWorkingSlots.clear();
for (BackendLoadStatistic beStat : loadStatistic.getSortedBeLoadStats(null)) {
Map<Long, TStorageMedium> paths = Maps.newHashMap();
beStat.getPathStatistics().stream().forEach(
path -> paths.put(path.getPathHash(), path.getStorageMedium()));
backendsWorkingSlots.put(beStat.getBeId(), new PathSlot(paths, beStat.getBeId()));
}
}
private void createPartitionsForTable(OlapTable olapTable, MaterializedIndex index, Long partitionCount) {
@ -187,8 +193,9 @@ public class DiskRebalanceTest {
// case start
Configurator.setLevel("org.apache.doris.clone.DiskRebalancer", Level.DEBUG);
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex());
generateStatisticMap();
generateStatisticsAndPathSlots();
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex(),
backendsWorkingSlots);
rebalancer.updateLoadStatistic(statisticMap);
List<TabletSchedCtx> alternativeTablets = rebalancer.selectAlternativeTablets();
// check alternativeTablets;
@ -229,8 +236,9 @@ public class DiskRebalanceTest {
// case start
Configurator.setLevel("org.apache.doris.clone.DiskRebalancer", Level.DEBUG);
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex());
generateStatisticMap();
generateStatisticsAndPathSlots();
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex(),
backendsWorkingSlots);
rebalancer.updateLoadStatistic(statisticMap);
for (Map.Entry<Tag, LoadStatisticForTag> s : statisticMap.entrySet()) {
if (s.getValue() != null) {
@ -240,16 +248,6 @@ public class DiskRebalanceTest {
List<TabletSchedCtx> alternativeTablets = rebalancer.selectAlternativeTablets();
// check alternativeTablets;
Assert.assertEquals(2, alternativeTablets.size());
Map<Long, PathSlot> backendsWorkingSlots = Maps.newConcurrentMap();
for (Backend be : Env.getCurrentSystemInfo().getAllBackends()) {
if (!backendsWorkingSlots.containsKey(be.getId())) {
List<Long> pathHashes = be.getDisks().values().stream().map(DiskInfo::getPathHash)
.collect(Collectors.toList());
PathSlot slot = new PathSlot(pathHashes, Config.schedule_slot_num_per_path);
backendsWorkingSlots.put(be.getId(), slot);
}
}
for (TabletSchedCtx tabletCtx : alternativeTablets) {
LOG.info("try to schedule tablet {}", tabletCtx.getTabletId());
try {
@ -259,7 +257,7 @@ public class DiskRebalanceTest {
tabletCtx.setSchemaHash(olapTable.getSchemaHashByIndexId(tabletCtx.getIndexId()));
tabletCtx.setTabletStatus(Tablet.TabletStatus.HEALTHY); // rebalance tablet should be healthy first
AgentTask task = rebalancer.createBalanceTask(tabletCtx, backendsWorkingSlots);
AgentTask task = rebalancer.createBalanceTask(tabletCtx);
if (tabletCtx.getTabletSize() == 0) {
Assert.fail("no exception");
} else {

View File

@ -196,7 +196,7 @@ public class RebalanceTest {
@Test
public void testPrioBackends() {
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex());
Rebalancer rebalancer = new DiskRebalancer(Env.getCurrentSystemInfo(), Env.getCurrentInvertedIndex(), null);
// add
{ // CHECKSTYLE IGNORE THIS LINE
List<Backend> backends = Lists.newArrayList();
@ -232,7 +232,7 @@ public class RebalanceTest {
// Call runAfterCatalogReady manually instead of starting daemon thread
TabletSchedulerStat stat = new TabletSchedulerStat();
PartitionRebalancer rebalancer = new PartitionRebalancer(Env.getCurrentSystemInfo(),
Env.getCurrentInvertedIndex());
Env.getCurrentInvertedIndex(), null);
TabletScheduler tabletScheduler = new TabletScheduler(env, systemInfoService, invertedIndex, stat, "");
// The rebalancer inside the scheduler will use this rebalancer, for getToDeleteReplicaId
Deencapsulation.setField(tabletScheduler, "rebalancer", rebalancer);
@ -256,7 +256,7 @@ public class RebalanceTest {
tabletCtx.setTabletStatus(Tablet.TabletStatus.HEALTHY); // rebalance tablet should be healthy first
// createCloneReplicaAndTask, create replica will change invertedIndex too.
AgentTask task = rebalancer.createBalanceTask(tabletCtx, tabletScheduler.getBackendsWorkingSlots());
AgentTask task = rebalancer.createBalanceTask(tabletCtx);
batchTask.addTask(task);
} catch (SchedException e) {
LOG.warn("schedule tablet {} failed: {}", tabletCtx.getTabletId(), e.getMessage());

View File

@ -19,6 +19,7 @@ package org.apache.doris.clone;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
@ -32,6 +33,7 @@ import org.apache.doris.thrift.TStorageMedium;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Table;
import java.util.List;
import java.util.Map;
@ -106,4 +108,25 @@ public class RebalancerTestUtil {
invertedIndex.addReplica(tablet.getId(), replica);
});
}
    /**
     * Backfills a path hash on every replica in the inverted index, so that the
     * tablet scheduler (which requires replicas to have valid path hashes) can
     * operate in unit tests.
     *
     * For each (tabletId, backendId, replica) cell of the replica meta table,
     * picks the backend's first disk whose storage medium matches the tablet's
     * medium and copies that disk's path hash onto the replica. Replicas on
     * unknown backends are skipped; a replica with no medium-matching disk is
     * left untouched.
     */
    public static void updateReplicaPathHash() {
        Table<Long, Long, Replica> replicaMetaTable = Env.getCurrentInvertedIndex().getReplicaMetaTable();
        for (Table.Cell<Long, Long, Replica> cell : replicaMetaTable.cellSet()) {
            // column key = backend id, row key = tablet id
            long beId = cell.getColumnKey();
            Backend be = Env.getCurrentSystemInfo().getBackend(beId);
            if (be == null) {
                // backend may have been dropped; nothing to assign
                continue;
            }
            Replica replica = cell.getValue();
            TabletMeta tabletMeta = Env.getCurrentInvertedIndex().getTabletMeta(cell.getRowKey());
            ImmutableMap<String, DiskInfo> diskMap = be.getDisks();
            for (DiskInfo diskInfo : diskMap.values()) {
                // first disk with the matching medium wins
                if (diskInfo.getStorageMedium() == tabletMeta.getStorageMedium()) {
                    replica.setPathHash(diskInfo.getPathHash());
                    break;
                }
            }
        }
    }
}

View File

@ -31,18 +31,22 @@ public class RootPathLoadStatisticTest {
@Test
public void test() {
RootPathLoadStatistic usageLow = new RootPathLoadStatistic(0L, "/home/disk1", 12345L, TStorageMedium.HDD, 4096L,
RootPathLoadStatistic usage1 = new RootPathLoadStatistic(0L, "/home/disk1", 12345L, TStorageMedium.HDD, 4096L,
1024L, DiskState.ONLINE);
RootPathLoadStatistic usageHigh = new RootPathLoadStatistic(0L, "/home/disk2", 67890L, TStorageMedium.HDD,
RootPathLoadStatistic usage2 = new RootPathLoadStatistic(0L, "/home/disk2", 67890L, TStorageMedium.HDD,
4096L, 2048L, DiskState.ONLINE);
List<RootPathLoadStatistic> list = Lists.newArrayList();
list.add(usageLow);
list.add(usageHigh);
list.add(usage1);
list.add(usage2);
// low usage should be ahead
Collections.sort(list);
Assert.assertTrue(list.get(0).getPathHash() == usageLow.getPathHash());
Assert.assertTrue(list.get(0).getPathHash() == usage1.getPathHash());
usage1.incrCopingSizeB(2048L);
Collections.sort(list);
Assert.assertTrue(list.get(1).getPathHash() == usage1.getPathHash());
}
}

View File

@ -26,7 +26,6 @@ import org.apache.doris.analysis.DropTableStmt;
import org.apache.doris.catalog.ColocateGroupSchema;
import org.apache.doris.catalog.ColocateTableIndex;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
@ -36,7 +35,6 @@ import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.ReplicaAllocation;
import org.apache.doris.catalog.Tablet;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.common.DdlException;
@ -54,7 +52,6 @@ import org.apache.doris.thrift.TDisk;
import org.apache.doris.thrift.TStorageMedium;
import org.apache.doris.utframe.UtFrameUtils;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Table;
@ -162,7 +159,7 @@ public class TabletRepairAndBalanceTest {
CreateTableStmt createTableStmt = (CreateTableStmt) UtFrameUtils.parseAndAnalyzeStmt(sql, connectContext);
Env.getCurrentEnv().createTable(createTableStmt);
// must set replicas' path hash, or the tablet scheduler won't work
updateReplicaPathHash();
RebalancerTestUtil.updateReplicaPathHash();
}
private static void dropTable(String sql) throws Exception {
@ -170,26 +167,6 @@ public class TabletRepairAndBalanceTest {
Env.getCurrentEnv().dropTable(dropTableStmt);
}
private static void updateReplicaPathHash() {
Table<Long, Long, Replica> replicaMetaTable = Env.getCurrentInvertedIndex().getReplicaMetaTable();
for (Table.Cell<Long, Long, Replica> cell : replicaMetaTable.cellSet()) {
long beId = cell.getColumnKey();
Backend be = Env.getCurrentSystemInfo().getBackend(beId);
if (be == null) {
continue;
}
Replica replica = cell.getValue();
TabletMeta tabletMeta = Env.getCurrentInvertedIndex().getTabletMeta(cell.getRowKey());
ImmutableMap<String, DiskInfo> diskMap = be.getDisks();
for (DiskInfo diskInfo : diskMap.values()) {
if (diskInfo.getStorageMedium() == tabletMeta.getStorageMedium()) {
replica.setPathHash(diskInfo.getPathHash());
break;
}
}
}
}
private static void alterTable(String sql) throws Exception {
AlterTableStmt alterTableStmt = (AlterTableStmt) UtFrameUtils.parseAndAnalyzeStmt(sql, connectContext);
Env.getCurrentEnv().getAlterInstance().processAlterTable(alterTableStmt);
@ -498,7 +475,7 @@ public class TabletRepairAndBalanceTest {
ExceptionChecker.expectThrowsNoException(() -> createTable(createStr6));
OlapTable tbl3 = db.getOlapTableOrDdlException("col_tbl3");
updateReplicaPathHash();
RebalancerTestUtil.updateReplicaPathHash();
// Set one replica's state as DECOMMISSION, see if it can be changed to NORMAL
Tablet oneTablet = null;
Replica oneReplica = null;

View File

@ -114,7 +114,8 @@ public class AgentTaskTest {
// clone
cloneTask =
new CloneTask(backendId1, dbId, tableId, partitionId, indexId1, tabletId1, replicaId1, schemaHash1,
new CloneTask(new TBackend("host2", 8290, 8390), backendId1, dbId, tableId, partitionId,
indexId1, tabletId1, replicaId1, schemaHash1,
Arrays.asList(new TBackend("host1", 8290, 8390)), TStorageMedium.HDD, -1, 3600);
// storageMediaMigrationTask