[fix](alter)(tablet-scheduler) fix unexpected exception with compaction_too_slow message when adding a rollup to an OLAP table (#10827)
This commit is contained in:
@ -37,6 +37,7 @@ import org.apache.doris.catalog.OlapTable.OlapTableState;
|
||||
import org.apache.doris.catalog.Partition;
|
||||
import org.apache.doris.catalog.PrimitiveType;
|
||||
import org.apache.doris.catalog.Replica;
|
||||
import org.apache.doris.catalog.Replica.ReplicaState;
|
||||
import org.apache.doris.catalog.Table;
|
||||
import org.apache.doris.catalog.Tablet;
|
||||
import org.apache.doris.catalog.TabletInvertedIndex;
|
||||
@ -384,18 +385,19 @@ public class MaterializedViewHandler extends AlterHandler {
|
||||
for (Replica baseReplica : baseReplicas) {
|
||||
long mvReplicaId = catalog.getNextId();
|
||||
long backendId = baseReplica.getBackendId();
|
||||
if (baseReplica.getState() == Replica.ReplicaState.CLONE
|
||||
|| baseReplica.getState() == Replica.ReplicaState.DECOMMISSION
|
||||
if (baseReplica.getState() == ReplicaState.CLONE
|
||||
|| baseReplica.getState() == ReplicaState.DECOMMISSION
|
||||
|| baseReplica.getState() == ReplicaState.COMPACTION_TOO_SLOW
|
||||
|| baseReplica.getLastFailedVersion() > 0) {
|
||||
LOG.info("base replica {} of tablet {} state is {}, and last failed version is {},"
|
||||
+ " skip creating rollup replica", baseReplica.getId(), baseTabletId,
|
||||
baseReplica.getState(), baseReplica.getLastFailedVersion());
|
||||
continue;
|
||||
}
|
||||
Preconditions.checkState(baseReplica.getState() == Replica.ReplicaState.NORMAL,
|
||||
Preconditions.checkState(baseReplica.getState() == ReplicaState.NORMAL,
|
||||
baseReplica.getState());
|
||||
// replica's init state is ALTER, so that tablet report process will ignore its report
|
||||
Replica mvReplica = new Replica(mvReplicaId, backendId, Replica.ReplicaState.ALTER,
|
||||
Replica mvReplica = new Replica(mvReplicaId, backendId, ReplicaState.ALTER,
|
||||
Partition.PARTITION_INIT_VERSION, mvSchemaHash);
|
||||
newTablet.addReplica(mvReplica);
|
||||
healthyReplicaNum++;
|
||||
|
||||
@ -527,28 +527,27 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
|
||||
|
||||
public boolean compactionRecovered() {
|
||||
Replica chosenReplica = null;
|
||||
long maxVersionCount = -1;
|
||||
long minVersionCount = Integer.MAX_VALUE;
|
||||
long maxVersionCount = Integer.MIN_VALUE;
|
||||
for (Replica replica : tablet.getReplicas()) {
|
||||
if (replica.getVersionCount() > maxVersionCount) {
|
||||
maxVersionCount = replica.getVersionCount();
|
||||
chosenReplica = replica;
|
||||
}
|
||||
if (replica.getVersionCount() < minVersionCount) {
|
||||
minVersionCount = replica.getVersionCount();
|
||||
}
|
||||
}
|
||||
boolean recovered = false;
|
||||
for (Replica replica : tablet.getReplicas()) {
|
||||
if (replica.isAlive() && replica.tooSlow() && !chosenReplica.equals(replica)) {
|
||||
chosenReplica.setState(ReplicaState.NORMAL);
|
||||
recovered = true;
|
||||
if (replica.isAlive() && replica.tooSlow() && (!replica.equals(chosenReplica)
|
||||
|| replica.getVersionCount() < Config.min_version_count_indicate_replica_compaction_too_slow)) {
|
||||
if (chosenReplica != null) {
|
||||
chosenReplica.setState(ReplicaState.NORMAL);
|
||||
recovered = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return recovered;
|
||||
}
|
||||
|
||||
// database lock should be held.
|
||||
// table lock should be held.
|
||||
// If exceptBeId != -1, should not choose src replica with same BE id as exceptBeId
|
||||
public void chooseSrcReplica(Map<Long, PathSlot> backendsWorkingSlots, long exceptBeId) throws SchedException {
|
||||
/*
|
||||
|
||||
@ -1071,9 +1071,7 @@ public class TabletScheduler extends MasterDaemon {
|
||||
*/
|
||||
private void handleReplicaTooSlow(TabletSchedCtx tabletCtx) throws SchedException {
|
||||
Replica chosenReplica = null;
|
||||
Replica minReplica = null;
|
||||
long maxVersionCount = -1;
|
||||
long minVersionCount = Integer.MAX_VALUE;
|
||||
int normalReplicaCount = 0;
|
||||
for (Replica replica : tabletCtx.getReplicas()) {
|
||||
if (replica.isAlive() && !replica.tooSlow()) {
|
||||
@ -1083,20 +1081,16 @@ public class TabletScheduler extends MasterDaemon {
|
||||
maxVersionCount = replica.getVersionCount();
|
||||
chosenReplica = replica;
|
||||
}
|
||||
if (replica.getVersionCount() < minVersionCount) {
|
||||
minVersionCount = replica.getVersionCount();
|
||||
minReplica = replica;
|
||||
}
|
||||
}
|
||||
|
||||
if (chosenReplica != null && !chosenReplica.equals(minReplica) && minReplica.isAlive() && !minReplica.tooSlow()
|
||||
&& normalReplicaCount >= 1) {
|
||||
if (chosenReplica != null && chosenReplica.isAlive() && !chosenReplica.tooSlow()
|
||||
&& chosenReplica.getVersionCount() > Config.min_version_count_indicate_replica_compaction_too_slow
|
||||
&& normalReplicaCount - 1 >= tabletCtx.getReplicas().size() / 2 + 1) {
|
||||
chosenReplica.setState(ReplicaState.COMPACTION_TOO_SLOW);
|
||||
LOG.info("set replica id :{} tablet id: {}, backend id: {} to COMPACTION_TOO_SLOW",
|
||||
chosenReplica.getId(), tabletCtx.getTablet().getId(), chosenReplica.getBackendId());
|
||||
throw new SchedException(Status.FINISHED, "set replica to COMPACTION_TOO_SLOW");
|
||||
}
|
||||
throw new SchedException(Status.FINISHED, "No replica too slow");
|
||||
throw new SchedException(Status.FINISHED, "No replica set to COMPACTION_TOO_SLOW");
|
||||
}
|
||||
|
||||
private void deleteReplicaInternal(TabletSchedCtx tabletCtx,
|
||||
|
||||
@ -1587,7 +1587,7 @@ public class Config extends ConfigBase {
|
||||
public static int min_version_count_indicate_replica_compaction_too_slow = 200;
|
||||
|
||||
/**
|
||||
* The valid ratio threshold of the difference between the version count of the slowest replicaand the fastest
|
||||
* The valid ratio threshold of the difference between the version count of the slowest replica and the fastest
|
||||
* replica. If repair_slow_replica is set to true, it is used to determine whether to repair the slowest replica
|
||||
*/
|
||||
@ConfField(mutable = true, masterOnly = true)
|
||||
|
||||
@ -139,9 +139,6 @@ public class TabletReplicaTooSlowTest {
|
||||
Backend be = Catalog.getCurrentSystemInfo().getBackend(beId);
|
||||
List<Long> pathHashes = be.getDisks().values().stream()
|
||||
.map(DiskInfo::getPathHash).collect(Collectors.toList());
|
||||
if (be == null) {
|
||||
continue;
|
||||
}
|
||||
Replica replica = cell.getValue();
|
||||
replica.setVersionCount(versionCount);
|
||||
versionCount = versionCount + 200;
|
||||
|
||||
Reference in New Issue
Block a user