[opt](nereids)remove partition & histogram from col stats to reduce memory usage #27885

This commit is contained in:
minghong
2023-12-04 14:52:05 +08:00
committed by GitHub
parent 48935c14e2
commit e80526ee3a
9 changed files with 13 additions and 267 deletions

View File

@ -417,7 +417,6 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
ColumnStatistic rightStats = cp.right().accept(this, context);
return new ColumnStatisticBuilder(leftStats)
.setNumNulls(StatsMathUtil.maxNonNaN(leftStats.numNulls, rightStats.numNulls))
.setHistogram(null)
.setNdv(2).build();
}
@ -430,7 +429,7 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
ColumnStatistic columnStatistic = childExprs.get(i).accept(this, context);
maxNull = StatsMathUtil.maxNonNaN(maxNull, columnStatistic.numNulls);
}
return new ColumnStatisticBuilder(firstChild).setNumNulls(maxNull).setNdv(2).setHistogram(null).build();
return new ColumnStatisticBuilder(firstChild).setNumNulls(maxNull).setNdv(2).build();
}
@Override
@ -707,7 +706,6 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
.setMinValue(0)
.setMaxValue(1)
.setNumNulls(0)
.setHistogram(null)
.setAvgSizeByte(random.getDataType().width())
.setDataSize(random.getDataType().width() * context.getRowCount()).build();
}

View File

@ -40,11 +40,8 @@ import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.Function;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.statistics.Bucket;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.Histogram;
import org.apache.doris.statistics.HistogramBuilder;
import org.apache.doris.statistics.StatisticRange;
import org.apache.doris.statistics.Statistics;
import org.apache.doris.statistics.StatisticsBuilder;
@ -52,7 +49,6 @@ import org.apache.doris.statistics.StatisticsBuilder;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
@ -180,10 +176,6 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context, boolean contains) {
if (statsForLeft.hasHistogram()) {
return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft,
statsForRight.maxValue, context, contains);
}
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr,
statsForRight.maxValue, statsForRight.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
@ -194,10 +186,6 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context, boolean contains) {
if (statsForLeft.hasHistogram()) {
return estimateGreaterThanLiteralWithHistogram(leftExpr, statsForLeft,
statsForRight.minValue, context, contains);
}
StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr,
statsForLeft.maxValue, statsForLeft.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
@ -237,10 +225,6 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
} else {
selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv);
}
if (statsForLeft.hasHistogram()) {
return estimateEqualToWithHistogram(cp.left(), statsForLeft, val, context);
}
Statistics equalStats = context.statistics.withSel(selectivity);
Expression left = cp.left();
equalStats.addColumnStats(left, statsForRight);
@ -569,103 +553,6 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
.addColumnStats(rightExpr, rightColumnStatistic);
}
/**
 * Estimates selectivity of a "less than" predicate (leftExpr < literal, or
 * <= when {@code contains} is true) using the column's histogram.
 *
 * @param leftExpr  the column-side expression of the comparison
 * @param leftStats statistics of leftExpr; caller guarantees it has a histogram
 * @param numVal    the literal bound (caller passes statsForRight.maxValue)
 * @param context   estimation context holding the input Statistics
 * @param contains  true for <= (bound included), false for strict <
 * @return filtered Statistics; selectivity 0 when the literal lands in no bucket
 */
private Statistics estimateLessThanLiteralWithHistogram(Expression leftExpr, ColumnStatistic leftStats,
        double numVal, EstimationContext context, boolean contains) {
    Histogram leftHist = leftStats.histogram;
    for (int i = 0; i < leftHist.buckets.size(); i++) {
        Bucket bucket = leftHist.buckets.get(i);
        if (bucket.upper >= numVal && bucket.lower <= numVal) {
            double overlapPercentInBucket;
            if (numVal == bucket.upper && numVal == bucket.lower) {
                // Single-point bucket: it qualifies only when the bound is inclusive.
                overlapPercentInBucket = contains ? 1 : 0;
            } else {
                overlapPercentInBucket = StatsMathUtil.minNonNaN(1, (numVal - bucket.lower)
                        / (bucket.upper - bucket.lower));
            }
            double overlapCountInBucket = overlapPercentInBucket * bucket.count;
            double sel = StatsMathUtil.minNonNaN(1, (bucket.preSum + overlapCountInBucket)
                    / StatsMathUtil.nonZeroDivisor(context.statistics.getRowCount()));
            // BUGFIX: copy the buckets strictly below i instead of taking
            // subList(0, i + 1). The subList view aliased leftHist.buckets, so
            // add() mutated the shared histogram, and it also kept the full
            // bucket i alongside its truncated replacement, double-counting
            // the overlap range.
            List<Bucket> updatedBucketList = new ArrayList<>(leftHist.buckets.subList(0, i));
            updatedBucketList.add(new Bucket(bucket.lower, numVal, overlapCountInBucket,
                    bucket.preSum, overlapPercentInBucket * bucket.ndv));
            ColumnStatistic columnStatistic = new ColumnStatisticBuilder(leftStats)
                    .setMaxValue(numVal)
                    .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build())
                    .build();
            context.addKeyIfSlot(leftExpr);
            return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic);
        }
    }
    // NOTE(review): a literal above the histogram's max also lands here and is
    // estimated as selectivity 0, though "x < huge" should select everything —
    // confirm whether callers pre-clamp the literal into the column range.
    return context.statistics.withSel(0);
}
/**
 * Estimates selectivity of a "greater than" predicate (leftExpr > literal, or
 * >= when {@code contains} is true) using the column's histogram.
 *
 * @param leftExpr  the column-side expression of the comparison
 * @param leftStats statistics of leftExpr; caller guarantees it has a histogram
 * @param numVal    the literal bound (caller passes statsForRight.minValue)
 * @param context   estimation context holding the input Statistics
 * @param contains  true for >= (bound included), false for strict >
 * @return filtered Statistics; selectivity 0 when the literal lands in no bucket
 */
private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, ColumnStatistic leftStats,
        double numVal, EstimationContext context, boolean contains) {
    Histogram leftHist = leftStats.histogram;
    for (int i = 0; i < leftHist.buckets.size(); i++) {
        Bucket bucket = leftHist.buckets.get(i);
        if (bucket.upper >= numVal && bucket.lower <= numVal) {
            double overlapPercentInBucket;
            if (numVal == bucket.upper && numVal == bucket.lower) {
                // Single-point bucket: it qualifies only when the bound is inclusive.
                overlapPercentInBucket = contains ? 1 : 0;
            } else {
                overlapPercentInBucket = StatsMathUtil.minNonNaN(1, ((bucket.upper - numVal)
                        / (bucket.upper - bucket.lower)));
            }
            double overlapCountInBucket = overlapPercentInBucket * bucket.count;
            // Rows kept = everything after this bucket plus the overlapping part
            // of this bucket.
            double sel = StatsMathUtil.minNonNaN(1,
                    (leftHist.size() - bucket.preSum - (bucket.count - overlapCountInBucket))
                            / context.statistics.getRowCount());
            List<Bucket> updatedBucketList = new ArrayList<>();
            updatedBucketList.add(new Bucket(numVal, bucket.upper, overlapCountInBucket,
                    0, overlapPercentInBucket * bucket.ndv));
            // BUGFIX: start the tail at i + 1 — subList(i, ...) re-included the
            // full bucket i after its truncated replacement was already added,
            // double-counting the overlap range.
            updatedBucketList.addAll(leftHist.buckets.subList(i + 1, leftHist.buckets.size()));
            ColumnStatistic columnStatistic = new ColumnStatisticBuilder(leftStats)
                    // BUGFIX: a greater-than predicate raises the column's lower
                    // bound; the original wrongly called setMaxValue(numVal),
                    // producing an inverted [min > max] range.
                    .setMinValue(numVal)
                    .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build())
                    .build();
            context.addKeyIfSlot(leftExpr);
            return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic);
        }
    }
    // NOTE(review): a literal below the histogram's min also lands here and is
    // estimated as selectivity 0, though "x > tiny" should select everything —
    // confirm whether callers pre-clamp the literal into the column range.
    return context.statistics.withSel(0);
}
/**
 * Estimates selectivity of an equality predicate (leftExpr = literal) using
 * the column's histogram: within the matching bucket, each distinct value is
 * assumed to hold count/ndv rows.
 *
 * @param leftExpr  the column-side expression of the comparison
 * @param leftStats statistics of leftExpr; caller guarantees it has a histogram
 * @param numVal    the literal value being matched
 * @param context   estimation context holding the input Statistics
 * @return filtered Statistics; zero statistics when no bucket covers the literal
 */
private Statistics estimateEqualToWithHistogram(Expression leftExpr, ColumnStatistic leftStats,
        double numVal, EstimationContext context) {
    Histogram histogram = leftStats.histogram;
    double selectivity = 0;
    // Scan all buckets without breaking: when numVal sits on a boundary shared
    // by two adjacent buckets, the later bucket's estimate wins, matching the
    // original last-match behavior.
    for (Bucket candidate : histogram.buckets) {
        if (candidate.lower <= numVal && numVal <= candidate.upper) {
            selectivity = (candidate.count / candidate.ndv) / histogram.size();
        }
    }
    if (selectivity == 0) {
        return Statistics.zero(context.statistics);
    }
    // The predicate pins the column to a single value with no nulls.
    ColumnStatistic pointStats = new ColumnStatisticBuilder(leftStats)
            .setHistogram(null)
            .setNdv(1)
            .setNumNulls(0)
            .setMaxValue(numVal)
            .setMinValue(numVal)
            .build();
    context.addKeyIfSlot(leftExpr);
    return context.statistics.withSel(selectivity).addColumnStats(leftExpr, pointStats);
}
@Override
public Statistics visitLike(Like like, EstimationContext context) {
StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics);

View File

@ -19,8 +19,6 @@ package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.IntLiteral;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.PartitionType;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
@ -598,31 +596,6 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
}
}
/**
 * Histogram lookup for a column. Currently disabled — it always returns null,
 * so downstream histogram-based estimation paths are never taken.
 *
 * @param table   the table owning the column (currently unused)
 * @param colName the column name (currently unused)
 * @return always null while histogram retrieval is disabled
 */
private Histogram getColumnHistogram(TableIf table, String colName) {
    // Retrieval from the dump map / statistics cache was intentionally
    // disabled; the dead commented-out lookup code has been removed.
    // Re-enable by querying Env.getCurrentEnv().getStatisticsCache() if
    // histograms are ever brought back.
    return null;
}
/**
 * Attaches the table's partition info to the column statistics when per-partition
 * stats exist and the OLAP table is actually partitioned; otherwise returns the
 * statistics untouched.
 *
 * @param tableIf  the table the statistics belong to
 * @param colStats the column statistics to enrich
 * @return colStats unchanged, or a copy carrying the table's PartitionInfo
 */
private ColumnStatistic setOlapPartitionInfo(TableIf tableIf, ColumnStatistic colStats) {
    // Nothing to attach without per-partition stats, and only OLAP tables
    // carry partition info.
    if (colStats.partitionIdToColStats.isEmpty() || !(tableIf instanceof OlapTable)) {
        return colStats;
    }
    OlapTable olapTable = (OlapTable) tableIf;
    if (olapTable.getPartitionInfo().getType() == PartitionType.UNPARTITIONED) {
        return colStats;
    }
    return new ColumnStatisticBuilder(colStats)
            .setPartitionInfo(olapTable.getPartitionInfo())
            .build();
}
// TODO: 1. Subtract the pruned partition
// 2. Consider the influence of runtime filter
// 3. Get NDV and column data size from StatisticManger, StatisticManager doesn't support it now.
@ -653,14 +626,6 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
}
if (!cache.isUnKnown) {
rowCount = Math.max(rowCount, cache.count);
cache = setOlapPartitionInfo(table, cache);
Histogram histogram = getColumnHistogram(table, colName);
if (histogram != null) {
ColumnStatisticBuilder columnStatisticBuilder =
new ColumnStatisticBuilder(cache).setHistogram(histogram);
columnStatisticMap.put(slotReference, columnStatisticBuilder.build());
cache = columnStatisticBuilder.build();
}
}
if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) {
columnStatisticMap.put(slotReference, cache);

View File

@ -19,7 +19,6 @@ package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PartitionInfo;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.datasource.InternalCatalog;
@ -89,21 +88,12 @@ public class ColumnStatistic {
public final LiteralExpr minExpr;
public final LiteralExpr maxExpr;
@SerializedName("histogram")
// assign value when do stats estimation.
public final Histogram histogram;
@SerializedName("partitionIdToColStats")
public final Map<String, ColumnStatistic> partitionIdToColStats = new HashMap<>();
public final String updatedTime;
public final PartitionInfo partitionInfo;
public ColumnStatistic(double count, double ndv, ColumnStatistic original, double avgSizeByte,
double numNulls, double dataSize, double minValue, double maxValue,
LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram,
String updatedTime, PartitionInfo partitionInfo) {
LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown,
String updatedTime) {
this.count = count;
this.ndv = ndv;
this.original = original;
@ -115,9 +105,7 @@ public class ColumnStatistic {
this.minExpr = minExpr;
this.maxExpr = maxExpr;
this.isUnKnown = isUnKnown;
this.histogram = histogram;
this.updatedTime = updatedTime;
this.partitionInfo = partitionInfo;
}
public static ColumnStatistic fromResultRow(List<ResultRow> resultRows) {
@ -139,7 +127,6 @@ public class ColumnStatistic {
if (columnStatistic == null) {
return ColumnStatistic.UNKNOWN;
}
columnStatistic.partitionIdToColStats.putAll(partitionIdToColStats);
return columnStatistic;
}
@ -242,7 +229,7 @@ public class ColumnStatistic {
public ColumnStatistic updateBySelectivity(double selectivity, double rowCount) {
if (isUnKnown) {
return UNKNOWN;
return this;
}
ColumnStatisticBuilder builder = new ColumnStatisticBuilder(this);
Double rowsAfterFilter = rowCount * selectivity;
@ -329,7 +316,6 @@ public class ColumnStatistic {
statistic.put("MinExpr", minExpr);
statistic.put("MaxExpr", maxExpr);
statistic.put("IsUnKnown", isUnKnown);
statistic.put("Histogram", Histogram.serializeToJson(histogram));
statistic.put("Original", original);
statistic.put("LastUpdatedTime", updatedTime);
return statistic;
@ -379,8 +365,7 @@ public class ColumnStatistic {
null,
null,
stat.getBoolean("IsUnKnown"),
Histogram.deserializeFromJson(stat.getString("Histogram")),
stat.getString("LastUpdatedTime"), null
stat.getString("LastUpdatedTime")
);
}
@ -388,10 +373,6 @@ public class ColumnStatistic {
return Double.isInfinite(maxValue) || Double.isInfinite(minValue);
}
public boolean hasHistogram() {
return histogram != null && histogram != Histogram.UNKNOWN;
}
public double getOriginalNdv() {
if (original != null) {
return original.ndv;
@ -399,16 +380,7 @@ public class ColumnStatistic {
return ndv;
}
// TODO expanded this function to support more cases, help to compute the change of ndv density
public boolean rangeChanged() {
return original != null && (minValue != original.minValue || maxValue != original.maxValue);
}
public boolean isUnKnown() {
return isUnKnown;
}
public void putPartStats(String partId, ColumnStatistic columnStatistic) {
this.partitionIdToColStats.put(partId, columnStatistic);
}
}

View File

@ -18,10 +18,6 @@
package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.PartitionInfo;
import java.util.HashMap;
import java.util.Map;
public class ColumnStatisticBuilder {
private double count;
@ -36,28 +32,13 @@ public class ColumnStatisticBuilder {
private boolean isUnknown;
private Histogram histogram;
private ColumnStatistic original;
private Map<String, ColumnStatistic> partitionIdToColStats = new HashMap<>();
private String updatedTime;
private PartitionInfo partitionInfo;
public ColumnStatisticBuilder() {
}
public PartitionInfo getPartitionInfo() {
return partitionInfo;
}
public ColumnStatisticBuilder setPartitionInfo(PartitionInfo partitionInfo) {
this.partitionInfo = partitionInfo;
return this;
}
public ColumnStatisticBuilder(ColumnStatistic columnStatistic) {
this.count = columnStatistic.count;
this.ndv = columnStatistic.ndv;
@ -69,11 +50,8 @@ public class ColumnStatisticBuilder {
this.minExpr = columnStatistic.minExpr;
this.maxExpr = columnStatistic.maxExpr;
this.isUnknown = columnStatistic.isUnKnown;
this.histogram = columnStatistic.histogram;
this.original = columnStatistic.original;
this.partitionIdToColStats.putAll(columnStatistic.partitionIdToColStats);
this.updatedTime = columnStatistic.updatedTime;
this.partitionInfo = columnStatistic.partitionInfo;
}
public ColumnStatisticBuilder setCount(double count) {
@ -171,15 +149,6 @@ public class ColumnStatisticBuilder {
return isUnknown;
}
public Histogram getHistogram() {
return histogram;
}
public ColumnStatisticBuilder setHistogram(Histogram histogram) {
this.histogram = histogram;
return this;
}
public String getUpdatedTime() {
return updatedTime;
}
@ -194,13 +163,11 @@ public class ColumnStatisticBuilder {
if (original == null && !isUnknown) {
original = new ColumnStatistic(count, ndv, null, avgSizeByte, numNulls,
dataSize, minValue, maxValue, minExpr, maxExpr,
isUnknown, histogram, updatedTime, partitionInfo);
original.partitionIdToColStats.putAll(partitionIdToColStats);
isUnknown, updatedTime);
}
ColumnStatistic colStats = new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls,
dataSize, minValue, maxValue, minExpr, maxExpr,
isUnknown, histogram, updatedTime, partitionInfo);
colStats.partitionIdToColStats.putAll(partitionIdToColStats);
isUnknown, updatedTime);
return colStats;
}
}

View File

@ -39,11 +39,9 @@ import org.apache.logging.log4j.Logger;
import java.time.Duration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
@ -205,11 +203,6 @@ public class StatisticsCache {
LOG.warn("Error when preheating stats cache", t);
}
}
try {
loadPartStats(keyToColStats);
} catch (Exception e) {
LOG.warn("Fucka", e);
}
}
/**
@ -262,40 +255,4 @@ public class StatisticsCache {
f.obtrudeValue(Optional.of(c));
columnStatisticsCache.put(k, f);
}
/**
 * Loads per-partition column statistics for all cache keys, batching the
 * repository queries so each batch stays under the expression-children limit.
 *
 * @param keyToColStats map from cache key to the column statistics that the
 *                      loaded partition stats are merged into
 */
private void loadPartStats(Map<StatisticsCacheKey, ColumnStatistic> keyToColStats) {
    final int batchSize = Config.expr_children_limit;
    Set<StatisticsCacheKey> keySet = new HashSet<>();
    for (StatisticsCacheKey statisticsCacheKey : keyToColStats.keySet()) {
        // BUGFIX: always add the current key before checking the batch size.
        // The original added only in the under-limit branch, so the key that
        // triggered a flush was silently dropped and its partition stats were
        // never loaded.
        keySet.add(statisticsCacheKey);
        if (keySet.size() >= batchSize - 1) {
            List<ResultRow> partStats = StatisticsRepository.loadPartStats(keySet);
            addPartStatsToColStats(keyToColStats, partStats);
            keySet = new HashSet<>();
        }
    }
    // Flush the final partial batch.
    if (!keySet.isEmpty()) {
        List<ResultRow> partStats = StatisticsRepository.loadPartStats(keySet);
        addPartStatsToColStats(keyToColStats, partStats);
    }
}
/**
 * Merges loaded per-partition statistics rows into the matching column
 * statistics. Rows that fail to parse are logged and skipped so one bad row
 * cannot abort the whole merge.
 *
 * @param keyToColStats map from cache key to the column statistics to update
 * @param partsStats    raw partition-stats rows from the statistics repository
 */
private void addPartStatsToColStats(Map<StatisticsCacheKey, ColumnStatistic> keyToColStats,
        List<ResultRow> partsStats) {
    for (ResultRow r : partsStats) {
        try {
            StatsId statsId = new StatsId(r);
            long tblId = statsId.tblId;
            long idxId = statsId.idxId;
            String partId = statsId.partId;
            String colId = statsId.colId;
            ColumnStatistic colStats = keyToColStats.get(new StatisticsCacheKey(tblId, idxId, colId));
            // BUGFIX: the map may not contain this key (e.g. stale repository
            // rows). The original dereferenced the null and relied on the
            // catch-all below, mislabeling the NPE as a deserialization failure.
            if (colStats == null) {
                LOG.warn("No cached column stats for tblId={} idxId={} colId={}, skip part stats",
                        tblId, idxId, colId);
                continue;
            }
            colStats.putPartStats(partId, ColumnStatistic.fromResultRow(r));
        } catch (Throwable t) {
            LOG.warn("Failed to deserialized part stats", t);
        }
    }
}
}

View File

@ -361,8 +361,8 @@ public class HyperGraphBuilder {
for (Slot slot : scanPlan.getOutput()) {
slotIdToColumnStats.put(slot,
new ColumnStatistic(count, count, null, 1, 0, 0, 0,
count, null, null, true, null,
new Date().toString(), null));
count, null, null, true,
new Date().toString()));
}
return new Statistics(count, slotIdToColumnStats);
}

View File

@ -241,8 +241,8 @@ public class CacheTest extends TestWithFeService {
public Optional<ColumnStatistic> getColumnStatistic(String colName) {
return Optional.of(new ColumnStatistic(1, 2,
null, 3, 4, 5, 6, 7,
null, null, false, null,
new Date().toString(), null));
null, null, false,
new Date().toString()));
}
};

View File

@ -29,8 +29,8 @@ public class StatsDeriveResultTest {
public void testUpdateRowCountByLimit() {
StatsDeriveResult stats = new StatsDeriveResult(100);
ColumnStatistic a = new ColumnStatistic(100, 10, null, 1, 5, 10,
1, 100, null, null, false, null,
new Date().toString(), null);
1, 100, null, null, false,
new Date().toString());
Id id = new Id(1);
stats.addColumnStats(id, a);
StatsDeriveResult res = stats.updateByLimit(0);