[opt](load)change load_to_single_tablet tablet search algorithm from random to round-robin (#25256)
At present, the `load_to_single_tablet` import implementation picks the target tablet with a simple random-number remainder, which cannot achieve a truly even distribution. This leads to uneven disk IO and uneven use of cluster resources. To solve this problem, we implement round-robin selection over the tablets of each partition on every import, so that the load is spread evenly across the tablets. When generating the load query plan, the index of the tablet currently being imported is passed to BE. A daemon task is added in FE to regularly clean up the `loadTabletRecordMap`. When `getCurrentLoadTabletIndex` is called, the map obtains the bucket_number of the partition and updates the `load_tablet_index`.
This commit is contained in:
@ -207,6 +207,7 @@ import org.apache.doris.persist.TruncateTableInfo;
|
||||
import org.apache.doris.persist.meta.MetaHeader;
|
||||
import org.apache.doris.persist.meta.MetaReader;
|
||||
import org.apache.doris.persist.meta.MetaWriter;
|
||||
import org.apache.doris.planner.SingleTabletLoadRecorderMgr;
|
||||
import org.apache.doris.plugin.PluginInfo;
|
||||
import org.apache.doris.plugin.PluginMgr;
|
||||
import org.apache.doris.policy.PolicyMgr;
|
||||
@ -327,6 +328,7 @@ public class Env {
|
||||
private LoadManager loadManager;
|
||||
private ProgressManager progressManager;
|
||||
private StreamLoadRecordMgr streamLoadRecordMgr;
|
||||
private SingleTabletLoadRecorderMgr singleTabletLoadRecorderMgr;
|
||||
private RoutineLoadManager routineLoadManager;
|
||||
private SqlBlockRuleMgr sqlBlockRuleMgr;
|
||||
private ExportMgr exportMgr;
|
||||
@ -689,6 +691,7 @@ public class Env {
|
||||
this.progressManager = new ProgressManager();
|
||||
this.streamLoadRecordMgr = new StreamLoadRecordMgr("stream_load_record_manager",
|
||||
Config.fetch_stream_load_record_interval_second * 1000L);
|
||||
this.singleTabletLoadRecorderMgr = new SingleTabletLoadRecorderMgr();
|
||||
this.loadEtlChecker = new LoadEtlChecker(loadManager);
|
||||
this.loadLoadingChecker = new LoadLoadingChecker(loadManager);
|
||||
this.routineLoadScheduler = new RoutineLoadScheduler(routineLoadManager);
|
||||
@ -1554,6 +1557,7 @@ public class Env {
|
||||
cooldownConfHandler.start();
|
||||
}
|
||||
streamLoadRecordMgr.start();
|
||||
singleTabletLoadRecorderMgr.start();
|
||||
getInternalCatalog().getIcebergTableCreationRecordMgr().start();
|
||||
new InternalSchemaInitializer().start();
|
||||
if (Config.enable_hms_events_incremental_sync) {
|
||||
@ -3778,6 +3782,10 @@ public class Env {
|
||||
return streamLoadRecordMgr;
|
||||
}
|
||||
|
||||
public SingleTabletLoadRecorderMgr getSingleTabletLoadRecorderMgr() {
|
||||
return singleTabletLoadRecorderMgr;
|
||||
}
|
||||
|
||||
public IcebergTableCreationRecordMgr getIcebergTableCreationRecordMgr() {
|
||||
return getInternalCatalog().getIcebergTableCreationRecordMgr();
|
||||
}
|
||||
|
||||
@ -109,6 +109,8 @@ public class OlapTableSink extends DataSink {
|
||||
|
||||
private boolean isStrictMode = false;
|
||||
|
||||
private boolean loadToSingleTablet;
|
||||
|
||||
public OlapTableSink(OlapTable dstTable, TupleDescriptor tupleDescriptor, List<Long> partitionIds,
|
||||
boolean singleReplicaLoad) {
|
||||
this.dstTable = dstTable;
|
||||
@ -131,6 +133,7 @@ public class OlapTableSink extends DataSink {
|
||||
"if load_to_single_tablet set to true," + " the olap table must be with random distribution");
|
||||
}
|
||||
tSink.setLoadToSingleTablet(loadToSingleTablet);
|
||||
this.loadToSingleTablet = loadToSingleTablet;
|
||||
tDataSink = new TDataSink(getDataSinkType());
|
||||
tDataSink.setOlapTableSink(tSink);
|
||||
|
||||
@ -330,6 +333,11 @@ public class OlapTableSink extends DataSink {
|
||||
tPartition.setNumBuckets(index.getTablets().size());
|
||||
}
|
||||
tPartition.setIsMutable(table.getPartitionInfo().getIsMutable(partitionId));
|
||||
if (loadToSingleTablet) {
|
||||
int tabletIndex = Env.getCurrentEnv().getSingleTabletLoadRecorderMgr()
|
||||
.getCurrentLoadTabletIndex(dbId, table.getId(), partitionId);
|
||||
tPartition.setLoadTabletIdx(tabletIndex);
|
||||
}
|
||||
partitionParam.addToPartitions(tPartition);
|
||||
|
||||
DistributionInfo distInfo = partition.getDistributionInfo();
|
||||
@ -384,6 +392,11 @@ public class OlapTableSink extends DataSink {
|
||||
index.getTablets().stream().map(Tablet::getId).collect(Collectors.toList()))));
|
||||
tPartition.setNumBuckets(index.getTablets().size());
|
||||
}
|
||||
if (loadToSingleTablet) {
|
||||
int tabletIndex = Env.getCurrentEnv().getSingleTabletLoadRecorderMgr()
|
||||
.getCurrentLoadTabletIndex(dbId, table.getId(), partition.getId());
|
||||
tPartition.setLoadTabletIdx(tabletIndex);
|
||||
}
|
||||
partitionParam.addToPartitions(tPartition);
|
||||
partitionParam.setDistributedColumns(getDistColumns(partition.getDistributionInfo()));
|
||||
partitionParam.setEnableAutomaticPartition(false);
|
||||
|
||||
@ -0,0 +1,112 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.planner;
|
||||
|
||||
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.MasterDaemon;

import lombok.Getter;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class SingleTabletLoadRecorderMgr extends MasterDaemon {
|
||||
private static final Logger LOG = LogManager.getLogger(SingleTabletLoadRecorderMgr.class);
|
||||
private static final long EXPIRY_TIME_INTERVAL_MS = 86400000; // 1 * 24 * 60 * 60 * 1000, 1 days
|
||||
|
||||
// <<db_id, table_id, partition_id> -> load_tablet_record>
|
||||
// 0 =< load_tablet_index < number_buckets
|
||||
private final ConcurrentHashMap<Triple<Long, Long, Long>, TabletUpdateRecord> loadTabletRecordMap =
|
||||
new ConcurrentHashMap<>();
|
||||
|
||||
public SingleTabletLoadRecorderMgr() {
|
||||
super("single_tablet_load_recorder", EXPIRY_TIME_INTERVAL_MS);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void runAfterCatalogReady() {
|
||||
long expiryTime = System.currentTimeMillis() - EXPIRY_TIME_INTERVAL_MS;
|
||||
loadTabletRecordMap.entrySet().removeIf(entry ->
|
||||
entry.getValue().getUpdateTimestamp() < expiryTime
|
||||
);
|
||||
LOG.info("Remove expired load tablet record successfully.");
|
||||
}
|
||||
|
||||
public int getCurrentLoadTabletIndex(long dbId, long tableId, long partitionId) throws UserException {
|
||||
Triple<Long, Long, Long> key = Triple.of(dbId, tableId, partitionId);
|
||||
TabletUpdateRecord record = loadTabletRecordMap.get(key);
|
||||
int numBuckets = -1;
|
||||
if (record == null) {
|
||||
numBuckets = getNumBuckets(dbId, tableId, partitionId);
|
||||
}
|
||||
return createOrUpdateLoadTabletRecord(key, numBuckets);
|
||||
}
|
||||
|
||||
private int getNumBuckets(long dbId, long tableId, long partitionId) throws UserException {
|
||||
OlapTable olapTable = (OlapTable) Env.getCurrentInternalCatalog().getDb(dbId)
|
||||
.flatMap(db -> db.getTable(tableId)).filter(t -> t.getType() == TableIf.TableType.OLAP)
|
||||
.orElse(null);
|
||||
if (olapTable == null) {
|
||||
throw new UserException("Olap table[" + dbId + "." + tableId + "] is not exist.");
|
||||
}
|
||||
return olapTable.getPartition(partitionId)
|
||||
.getMaterializedIndices(MaterializedIndex.IndexExtState.ALL)
|
||||
.get(0).getTablets().size();
|
||||
}
|
||||
|
||||
private int createOrUpdateLoadTabletRecord(Triple<Long, Long, Long> key, int numBuckets) {
|
||||
TabletUpdateRecord record = loadTabletRecordMap.compute(key, (k, existingRecord) -> {
|
||||
if (existingRecord == null) {
|
||||
return new TabletUpdateRecord(0, numBuckets);
|
||||
} else {
|
||||
existingRecord.updateRecord();
|
||||
return existingRecord;
|
||||
}
|
||||
});
|
||||
return record.getTabletIndex();
|
||||
}
|
||||
|
||||
static class TabletUpdateRecord {
|
||||
@Getter
|
||||
// 0 =< load_tablet_index < number_buckets
|
||||
int tabletIndex;
|
||||
int numBuckets;
|
||||
@Getter
|
||||
long updateTimestamp = System.currentTimeMillis();
|
||||
|
||||
TabletUpdateRecord(int tabletIndex, int numBuckets) {
|
||||
this.tabletIndex = tabletIndex;
|
||||
this.numBuckets = numBuckets;
|
||||
}
|
||||
|
||||
public synchronized void updateRecord() {
|
||||
this.tabletIndex = this.tabletIndex + 1 >= numBuckets ? 0 : this.tabletIndex + 1;
|
||||
// To reduce the compute time cost, only update timestamp when index is 0
|
||||
if (this.tabletIndex == 0) {
|
||||
this.updateTimestamp = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user