[improvement](query) prefer to choose tablet on alive disk #39467 (#39654)

cherry pick from #39467
This commit is contained in:
yujun
2024-08-23 12:23:12 +08:00
committed by GitHub
parent 1f16daa5f6
commit 0934fbee7e
8 changed files with 190 additions and 21 deletions

View File

@ -151,6 +151,10 @@ public class DiskInfo implements Writable {
return pathHash != 0;
}
/**
 * Reports whether this disk is currently usable.
 *
 * @return {@code true} if the disk state is {@code DiskState.ONLINE}, {@code false} otherwise
 */
public boolean isAlive() {
    return DiskState.ONLINE == state;
}
/**
 * Checks whether this disk's storage medium is the same as the requested one.
 *
 * @param storageMedium the storage medium to compare against
 * @return {@code true} if this disk's storage medium equals {@code storageMedium}
 */
public boolean isStorageMediumMatch(TStorageMedium storageMedium) {
    return storageMedium == this.storageMedium;
}

View File

@ -281,9 +281,11 @@ public class Tablet extends MetaObject implements Writable {
}
// for query
public List<Replica> getQueryableReplicas(long visibleVersion, boolean allowFailedVersion) {
public List<Replica> getQueryableReplicas(long visibleVersion, Map<Long, Set<Long>> backendAlivePathHashs,
boolean allowFailedVersion) {
List<Replica> allQueryableReplica = Lists.newArrayListWithCapacity(replicas.size());
List<Replica> auxiliaryReplica = Lists.newArrayListWithCapacity(replicas.size());
List<Replica> deadPathReplica = Lists.newArrayList();
for (Replica replica : replicas) {
if (replica.isBad()) {
continue;
@ -294,21 +296,31 @@ public class Tablet extends MetaObject implements Writable {
continue;
}
if (!replica.checkVersionCatchUp(visibleVersion, false)) {
continue;
}
Set<Long> thisBeAlivePaths = backendAlivePathHashs.get(replica.getBackendId());
ReplicaState state = replica.getState();
if (state.canQuery()) {
if (replica.checkVersionCatchUp(visibleVersion, false)) {
allQueryableReplica.add(replica);
}
// If thisBeAlivePaths contains path hash 0, this BE has not reported its disk state yet,
// so the replica must not be treated as being on a dead path.
if (replica.getPathHash() != -1 && thisBeAlivePaths != null
&& !thisBeAlivePaths.contains(replica.getPathHash())
&& !thisBeAlivePaths.contains(0L)) {
deadPathReplica.add(replica);
} else if (state.canQuery()) {
allQueryableReplica.add(replica);
} else if (state == ReplicaState.DECOMMISSION) {
if (replica.checkVersionCatchUp(visibleVersion, false)) {
auxiliaryReplica.add(replica);
}
auxiliaryReplica.add(replica);
}
}
if (allQueryableReplica.isEmpty()) {
allQueryableReplica = auxiliaryReplica;
}
if (allQueryableReplica.isEmpty()) {
allQueryableReplica = deadPathReplica;
}
if (Config.skip_compaction_slower_replica && allQueryableReplica.size() > 1) {
long minVersionCount = allQueryableReplica.stream().mapToLong(Replica::getVisibleVersionCount)

View File

@ -22,6 +22,7 @@ import org.apache.doris.catalog.BinlogConfig;
import org.apache.doris.catalog.ColocateGroupSchema;
import org.apache.doris.catalog.ColocateTableIndex;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.MaterializedIndex.IndexState;
@ -808,6 +809,15 @@ public class ReportHandler extends Daemon {
AgentBatchTask createReplicaBatchTask = new AgentBatchTask();
TabletInvertedIndex invertedIndex = Env.getCurrentInvertedIndex();
Map<Object, Object> objectPool = new HashMap<Object, Object>();
Backend backend = Env.getCurrentSystemInfo().getBackend(backendId);
Set<Long> backendHealthPathHashs;
if (backend == null) {
backendHealthPathHashs = Sets.newHashSet();
} else {
backendHealthPathHashs = backend.getDisks().values().stream()
.filter(DiskInfo::isAlive)
.map(DiskInfo::getPathHash).collect(Collectors.toSet());
}
for (Long dbId : tabletDeleteFromMeta.keySet()) {
Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId);
if (db == null) {
@ -863,7 +873,24 @@ public class ReportHandler extends Daemon {
long currentBackendReportVersion = Env.getCurrentSystemInfo()
.getBackendReportVersion(backendId);
if (backendReportVersion < currentBackendReportVersion) {
continue;
// If backendHealthPathHashs contains path hash 0,
// this backend has not reported its disk state yet,
// so the replica must not be treated as being on a bad disk.
boolean thisReplicaOnBadDisk = replica.getPathHash() != -1L
&& !backendHealthPathHashs.contains(replica.getPathHash())
&& !backendHealthPathHashs.contains(0L);
boolean existsOtherHealthReplica = tablet.getReplicas().stream()
.anyMatch(r -> r.getBackendId() != replica.getBackendId()
&& r.getVersion() >= replica.getVersion()
&& r.getLastFailedVersion() == -1L
&& !r.isBad());
// If the replica is on a bad disk and another healthy replica exists, still delete it.
if (!(thisReplicaOnBadDisk && existsOtherHealthReplica)) {
continue;
}
}
BinlogConfig binlogConfig = new BinlogConfig(olapTable.getBinlogConfig());

View File

@ -40,6 +40,7 @@ import org.apache.doris.analysis.TupleId;
import org.apache.doris.catalog.AggregateType;
import org.apache.doris.catalog.ColocateTableIndex;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.DistributionInfo;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.HashDistributionInfo;
@ -732,7 +733,7 @@ public class OlapScanNode extends ScanNode {
}
private void addScanRangeLocations(Partition partition,
List<Tablet> tablets) throws UserException {
List<Tablet> tablets, Map<Long, Set<Long>> backendAlivePathHashs) throws UserException {
long visibleVersion = partition.getVisibleVersion();
String visibleVersionStr = String.valueOf(visibleVersion);
@ -776,7 +777,8 @@ public class OlapScanNode extends ScanNode {
paloRange.setTabletId(tabletId);
// random shuffle List && only collect one copy
List<Replica> replicas = tablet.getQueryableReplicas(visibleVersion, skipMissingVersion);
List<Replica> replicas = tablet.getQueryableReplicas(visibleVersion,
backendAlivePathHashs, skipMissingVersion);
if (replicas.isEmpty()) {
if (ConnectContext.get().getSessionVariable().skipBadTablet) {
continue;
@ -1125,6 +1127,12 @@ public class OlapScanNode extends ScanNode {
*/
Preconditions.checkState(scanBackendIds.size() == 0);
Preconditions.checkState(scanTabletIds.size() == 0);
Map<Long, Set<Long>> backendAlivePathHashs = Maps.newHashMap();
for (Backend backend : Env.getCurrentSystemInfo().getAllBackends()) {
backendAlivePathHashs.put(backend.getId(), backend.getDisks().values().stream()
.filter(DiskInfo::isAlive).map(DiskInfo::getPathHash).collect(Collectors.toSet()));
}
for (Long partitionId : selectedPartitionIds) {
final Partition partition = olapTable.getPartition(partitionId);
final MaterializedIndex selectedTable = partition.getIndex(selectedIndexId);
@ -1166,7 +1174,7 @@ public class OlapScanNode extends ScanNode {
totalTabletsNum += selectedTable.getTablets().size();
selectedSplitNum += tablets.size();
addScanRangeLocations(partition, tablets);
addScanRangeLocations(partition, tablets, backendAlivePathHashs);
}
}