Fix bug where not enough normal replicas could be found because the path hash was not set. (#1714)

The path hash of a replica in metadata should be set immediately after the replica is created.
We should also not depend on the path hash to find replicas, because setting the path hash may be delayed.
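
For illustration, a minimal standalone sketch (assuming Guava on the classpath; the demo class name, backend ids, and the -1 "unset" sentinel are made up and are not Doris code) of how a map keyed by path hash drops replicas whose path hash has not been set yet, while a multimap keyed by backend id keeps them:

```java
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import java.util.HashMap;
import java.util.Map;

public class PathHashDemo {
    public static void main(String[] args) {
        long unsetPathHash = -1L; // hypothetical default before the BE reports the real path hash

        // Old approach: path hash -> backend id. Two replicas with an unset path hash
        // collapse into a single entry, so a quorum check sees too few replicas.
        Map<Long, Long> pathBeMap = new HashMap<>();
        pathBeMap.put(unsetPathHash, 10001L);
        pathBeMap.put(unsetPathHash, 10002L); // overwrites the entry for backend 10001
        System.out.println("replicas seen via path hash: " + pathBeMap.size());           // 1

        // New approach: backend id -> path hash. Each backend keeps its own entry,
        // whether or not its path hash has been set yet.
        Multimap<Long, Long> bePathsMap = HashMultimap.create();
        bePathsMap.put(10001L, unsetPathHash);
        bePathsMap.put(10002L, unsetPathHash);
        System.out.println("replicas seen via backend id: " + bePathsMap.keySet().size()); // 2
    }
}
```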
Mingyu Chen
2019-08-28 19:37:38 +08:00
committed by GitHub
parent 2159293d23
commit c541c3fd59
8 changed files with 48 additions and 33 deletions


@@ -953,7 +953,7 @@ OLAPStatus TabletManager::report_tablet_info(TTabletInfo* tablet_info) {
}
_build_tablet_info(tablet, tablet_info);
LOG(INFO) << "success to process report tablet info.";
VLOG(10) << "success to process report tablet info.";
return res;
} // report_tablet_info
@@ -987,13 +987,6 @@ OLAPStatus TabletManager::report_all_tablets_info(std::map<TTabletId, TTablet>*
tablet_ptr->schema_hash(), tablet_ptr->tablet_uid(), &transaction_ids);
tablet_info.__set_transaction_ids(transaction_ids);
if (_available_storage_medium_type_count > 1) {
tablet_info.__set_storage_medium(tablet_ptr->data_dir()->storage_medium());
}
tablet_info.__set_version_count(tablet_ptr->version_count());
tablet_info.__set_path_hash(tablet_ptr->data_dir()->path_hash());
tablet.tablet_infos.push_back(tablet_info);
}
@@ -1175,6 +1168,11 @@ void TabletManager::_build_tablet_info(TabletSharedPtr tablet, TTabletInfo* tabl
tablet_info->version = version.second;
tablet_info->version_hash = v_hash;
tablet_info->__set_partition_id(tablet->partition_id());
if (_available_storage_medium_type_count > 1) {
tablet_info->__set_storage_medium(tablet->data_dir()->storage_medium());
}
tablet_info->__set_version_count(tablet->version_count());
tablet_info->__set_path_hash(tablet->data_dir()->path_hash());
}
void TabletManager::_build_tablet_stat() {


@@ -595,6 +595,10 @@ public class RollupJob extends AlterJob {
// the version is not set now
rollupReplica.updateVersionInfo(version, versionHash, dataSize, rowCount);
if (finishTabletInfo.isSetPath_hash()) {
rollupReplica.setPathHash(finishTabletInfo.getPath_hash());
}
setReplicaFinished(partitionId, rollupReplicaId);
rollupReplica.setState(ReplicaState.NORMAL);


@@ -621,6 +621,9 @@ public class SchemaChangeJob extends AlterJob {
long rowCount = finishTabletInfo.getRow_count();
// do not need check version > replica.getVersion, because the new replica's version is first set by sc
replica.updateVersionInfo(version, versionHash, dataSize, rowCount);
if (finishTabletInfo.isSetPath_hash()) {
replica.setPathHash(finishTabletInfo.getPath_hash());
}
} finally {
db.writeUnlock();
}


@@ -67,8 +67,10 @@ import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.Table.Cell;
@@ -755,7 +757,7 @@ public class RestoreJob extends AbstractJob {
unfinishedSignatureToId.clear();
taskProgress.clear();
taskErrMsg.clear();
Map<Long, Long> pathBeMap = Maps.newHashMap();
Multimap<Long, Long> bePathsMap = HashMultimap.create();
batchTask = new AgentBatchTask();
db.readLock();
try {
@@ -774,14 +776,14 @@
true /* is restore task*/);
batchTask.addTask(task);
unfinishedSignatureToId.put(signature, tablet.getId());
pathBeMap.put(replica.getPathHash(), replica.getBackendId());
bePathsMap.put(replica.getBackendId(), replica.getPathHash());
}
} finally {
db.readUnlock();
}
// check disk capacity
org.apache.doris.common.Status st = Catalog.getCurrentSystemInfo().checkExceedDiskCapacityLimit(pathBeMap, true);
org.apache.doris.common.Status st = Catalog.getCurrentSystemInfo().checkExceedDiskCapacityLimit(bePathsMap, true);
if (!st.ok()) {
status = new Status(ErrCode.COMMON_ERROR, st.getErrorMsg());
return;


@@ -26,8 +26,9 @@ import org.apache.doris.common.io.Writable;
import org.apache.doris.system.Backend;
import org.apache.doris.system.SystemInfoService;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.logging.log4j.LogManager;
@@ -40,7 +41,6 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
@@ -186,9 +186,9 @@ public class Tablet extends MetaObject implements Writable {
return beIds;
}
// return map of (path hash -> BE id) of normal replicas
public Map<Long, Long> getNormalReplicaBackendPathMap() {
Map<Long, Long> map = Maps.newHashMap();
// return map of (BE id -> path hash) of normal replicas
public Multimap<Long, Long> getNormalReplicaBackendPathMap() {
Multimap<Long, Long> map = HashMultimap.create();
SystemInfoService infoService = Catalog.getCurrentSystemInfo();
for (Replica replica : replicas) {
if (replica.isBad()) {
@@ -198,7 +198,7 @@
ReplicaState state = replica.getState();
if (infoService.checkBackendAlive(replica.getBackendId())
&& (state == ReplicaState.NORMAL || state == ReplicaState.SCHEMA_CHANGE)) {
map.put(replica.getPathHash(), replica.getBackendId());
map.put(replica.getBackendId(), replica.getPathHash());
}
}
return map;


@@ -876,6 +876,9 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
replica.updateVersionInfo(reportedTablet.getVersion(), reportedTablet.getVersion_hash(),
reportedTablet.getData_size(), reportedTablet.getRow_count());
if (reportedTablet.isSetPath_hash()) {
replica.setPathHash(reportedTablet.getPath_hash());
}
if (this.type == Type.BALANCE) {
long partitionVisibleVersion = partition.getVisibleVersion();


@@ -55,8 +55,9 @@ import org.apache.doris.thrift.TUniqueId;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.Sets;
@@ -295,26 +296,27 @@ public class OlapTableSink extends DataSink {
private TOlapTableLocationParam createLocation(OlapTable table) throws UserException {
TOlapTableLocationParam locationParam = new TOlapTableLocationParam();
Map<Long, Long> allPathBeMap = Maps.newHashMap();
// BE id -> path hash
Multimap<Long, Long> allBePathsMap = HashMultimap.create();
for (Partition partition : table.getPartitions()) {
int quorum = table.getPartitionInfo().getReplicationNum(partition.getId()) / 2 + 1;
for (MaterializedIndex index : partition.getMaterializedIndices()) {
// we should ensure the replica backend is alive
// otherwise, there will be a 'unknown node id, id=xxx' error for stream load
for (Tablet tablet : index.getTablets()) {
Map<Long, Long> pathBeMap = tablet.getNormalReplicaBackendPathMap();
if (pathBeMap.size() < quorum) {
throw new UserException("tablet " + tablet.getId() + " has few replicas: " + pathBeMap.size());
Multimap<Long, Long> bePathsMap = tablet.getNormalReplicaBackendPathMap();
if (bePathsMap.keySet().size() < quorum) {
throw new UserException("tablet " + tablet.getId() + " has few replicas: " + bePathsMap.keySet().size());
}
locationParam.addToTablets(new TTabletLocation(tablet.getId(), Lists.newArrayList(pathBeMap.values())));
allPathBeMap.putAll(pathBeMap);
locationParam.addToTablets(new TTabletLocation(tablet.getId(), Lists.newArrayList(bePathsMap.keySet())));
allBePathsMap.putAll(bePathsMap);
}
}
}
// check if disk capacity reach limit
// this is for load process, so use high water mark to check
Status st = Catalog.getCurrentSystemInfo().checkExceedDiskCapacityLimit(allPathBeMap, true);
Status st = Catalog.getCurrentSystemInfo().checkExceedDiskCapacityLimit(allBePathsMap, true);
if (!st.ok()) {
throw new DdlException(st.getErrorMsg());
}


@@ -36,6 +36,7 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.commons.validator.routines.InetAddressValidator;
@@ -1116,19 +1117,21 @@ public class SystemInfoService {
/*
* Check if the specified disks' capacity has reached the limit.
* pathBeMap is (path hash -> BE id)
* bePathsMap is (BE id -> list of path hash)
* If floodStage is true, it will check with the floodStage threshold.
*
* return Status.OK if not reach the limit
*/
public Status checkExceedDiskCapacityLimit(Map<Long, Long> pathBeMap, boolean floodStage) {
LOG.debug("pathBeMap: {}", pathBeMap);
public Status checkExceedDiskCapacityLimit(Multimap<Long, Long> bePathsMap, boolean floodStage) {
LOG.debug("pathBeMap: {}", bePathsMap);
ImmutableMap<Long, DiskInfo> pathHashToDiskInfo = pathHashToDishInfoRef.get();
for (Long pathHash : pathBeMap.keySet()) {
DiskInfo diskInfo = pathHashToDiskInfo.get(pathHash);
if (diskInfo != null && diskInfo.exceedLimit(floodStage)) {
return new Status(TStatusCode.CANCELLED,
"disk " + pathHash + " on backend " + pathBeMap.get(pathHash) + " exceed limit usage");
for (Long beId : bePathsMap.keySet()) {
for (Long pathHash : bePathsMap.get(beId)) {
DiskInfo diskInfo = pathHashToDiskInfo.get(pathHash);
if (diskInfo != null && diskInfo.exceedLimit(floodStage)) {
return new Status(TStatusCode.CANCELLED,
"disk " + pathHash + " on backend " + beId + " exceed limit usage");
}
}
}
return Status.OK;
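
For reference, a minimal standalone sketch (the class name and ids are made up) of the backend-id-keyed iteration that the reworked checkExceedDiskCapacityLimit relies on: a Guava Multimap lets one backend id map to several path hashes (callers such as OlapTableSink and RestoreJob aggregate paths across many tablets), and get(key) returns a possibly empty collection rather than null, so the nested loop needs no null checks:

```java
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

public class BePathsIterationDemo {
    public static void main(String[] args) {
        // BE id -> path hashes of the disks holding its replicas (values are arbitrary)
        Multimap<Long, Long> bePathsMap = HashMultimap.create();
        bePathsMap.put(10001L, 111L);
        bePathsMap.put(10001L, 222L); // the same backend may contribute several disks
        bePathsMap.put(10002L, 333L);

        for (Long beId : bePathsMap.keySet()) {
            for (Long pathHash : bePathsMap.get(beId)) { // never null, possibly empty
                System.out.println("check disk " + pathHash + " on backend " + beId);
            }
        }
    }
}
```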