From e80526ee3a7e715c43fdaa8784cf327d727ee996 Mon Sep 17 00:00:00 2001 From: minghong Date: Mon, 4 Dec 2023 14:52:05 +0800 Subject: [PATCH] [opt](nereids)remove partition & histogram from col stats to reduce memory usage #27885 --- .../nereids/stats/ExpressionEstimation.java | 4 +- .../doris/nereids/stats/FilterEstimation.java | 113 ------------------ .../doris/nereids/stats/StatsCalculator.java | 35 ------ .../doris/statistics/ColumnStatistic.java | 36 +----- .../statistics/ColumnStatisticBuilder.java | 37 +----- .../doris/statistics/StatisticsCache.java | 43 ------- .../doris/nereids/util/HyperGraphBuilder.java | 4 +- .../apache/doris/statistics/CacheTest.java | 4 +- .../statistics/StatsDeriveResultTest.java | 4 +- 9 files changed, 13 insertions(+), 267 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java index 6526f78836..f82f509ba0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java @@ -417,7 +417,6 @@ public class ExpressionEstimation extends ExpressionVisitor= numVal && bucket.lower <= numVal) { - double overlapPercentInBucket; - if (numVal == bucket.upper && numVal == bucket.lower) { - if (contains) { - overlapPercentInBucket = 1; - } else { - overlapPercentInBucket = 0; - } - } else { - overlapPercentInBucket = StatsMathUtil.minNonNaN(1, (numVal - bucket.lower) - / (bucket.upper - bucket.lower)); - } - double overlapCountInBucket = overlapPercentInBucket * bucket.count; - double sel = StatsMathUtil.minNonNaN(1, (bucket.preSum + overlapCountInBucket) - / StatsMathUtil.nonZeroDivisor(context.statistics.getRowCount())); - List updatedBucketList = leftHist.buckets.subList(0, i + 1); - updatedBucketList.add(new Bucket(bucket.lower, numVal, overlapCountInBucket, - bucket.preSum, overlapPercentInBucket * bucket.ndv)); - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(leftStats) - .setMaxValue(numVal) - .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) - .build(); - context.addKeyIfSlot(leftExpr); - return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); - } - } - return context.statistics.withSel(0); - } - - private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, ColumnStatistic leftStats, - double numVal, EstimationContext context, boolean contains) { - Histogram leftHist = leftStats.histogram; - - for (int i = 0; i < leftHist.buckets.size(); i++) { - Bucket bucket = leftHist.buckets.get(i); - if (bucket.upper >= numVal && bucket.lower <= numVal) { - double overlapPercentInBucket; - if (numVal == bucket.upper && numVal == bucket.lower) { - if (contains) { - overlapPercentInBucket = 1; - } else { - overlapPercentInBucket = 0; - } - } else { - overlapPercentInBucket = StatsMathUtil.minNonNaN(1, ((bucket.upper - numVal) - / (bucket.upper - bucket.lower))); - } - double overlapCountInBucket = overlapPercentInBucket * bucket.count; - double sel = StatsMathUtil.minNonNaN(1, - (leftHist.size() - bucket.preSum - (bucket.count - overlapCountInBucket)) - / context.statistics.getRowCount()); - List updatedBucketList = new ArrayList<>(); - updatedBucketList.add(new Bucket(numVal, bucket.upper, overlapPercentInBucket * bucket.count, - 0, overlapPercentInBucket * bucket.ndv)); - updatedBucketList.addAll(leftHist.buckets.subList(i, leftHist.buckets.size())); - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(leftStats) - .setMaxValue(numVal) - .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) - .build(); - context.addKeyIfSlot(leftExpr); - return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); - } - } - return context.statistics.withSel(0); - } - - private Statistics estimateEqualToWithHistogram(Expression leftExpr, ColumnStatistic leftStats, - double numVal, EstimationContext context) { - Histogram histogram = leftStats.histogram; - - double sel = 0; - for (int i = 0; i < histogram.buckets.size(); i++) { - Bucket bucket = histogram.buckets.get(i); - if (bucket.upper >= numVal && bucket.lower <= numVal) { - sel = (bucket.count / bucket.ndv) / histogram.size(); - } - } - if (sel == 0) { - return Statistics.zero(context.statistics); - } - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(leftStats) - .setHistogram(null) - .setNdv(1) - .setNumNulls(0) - .setMaxValue(numVal) - .setMinValue(numVal) - .build(); - context.addKeyIfSlot(leftExpr); - return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); - } - @Override public Statistics visitLike(Like like, EstimationContext context) { StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 5cd650bbb5..8495411745 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -19,8 +19,6 @@ package org.apache.doris.nereids.stats; import org.apache.doris.analysis.IntLiteral; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.PartitionType; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; @@ -598,31 +596,6 @@ public class StatsCalculator extends DefaultPlanVisitor { } } - private Histogram getColumnHistogram(TableIf table, String colName) { - // if (totalHistogramMap.get(table.getName() + colName) != null) { - // return totalHistogramMap.get(table.getName() + colName); - // } else if (isPlayNereidsDump) { - // return null; - // } else { - // return Env.getCurrentEnv().getStatisticsCache().getHistogram(table.getId(), colName); - // } - return null; - } - - private ColumnStatistic setOlapPartitionInfo(TableIf tableIf, ColumnStatistic colStats) { - if (colStats.partitionIdToColStats.isEmpty()) { - return colStats; - } - if (!(tableIf instanceof OlapTable)) { - return colStats; - } - OlapTable table = (OlapTable) tableIf; - if (table.getPartitionInfo().getType() != PartitionType.UNPARTITIONED) { - colStats = new ColumnStatisticBuilder(colStats).setPartitionInfo(table.getPartitionInfo()).build(); - } - return colStats; - } - // TODO: 1. Subtract the pruned partition // 2. Consider the influence of runtime filter // 3. Get NDV and column data size from StatisticManger, StatisticManager doesn't support it now. @@ -653,14 +626,6 @@ public class StatsCalculator extends DefaultPlanVisitor { } if (!cache.isUnKnown) { rowCount = Math.max(rowCount, cache.count); - cache = setOlapPartitionInfo(table, cache); - Histogram histogram = getColumnHistogram(table, colName); - if (histogram != null) { - ColumnStatisticBuilder columnStatisticBuilder = - new ColumnStatisticBuilder(cache).setHistogram(histogram); - columnStatisticMap.put(slotReference, columnStatisticBuilder.build()); - cache = columnStatisticBuilder.build(); - } } if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) { columnStatisticMap.put(slotReference, cache); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index 72d0c68f0f..77796e04eb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -19,7 +19,6 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; import org.apache.doris.datasource.InternalCatalog; @@ -89,21 +88,12 @@ public class ColumnStatistic { public final LiteralExpr minExpr; public final LiteralExpr maxExpr; - @SerializedName("histogram") - // assign value when do stats estimation. - public final Histogram histogram; - - @SerializedName("partitionIdToColStats") - public final Map partitionIdToColStats = new HashMap<>(); - public final String updatedTime; - public final PartitionInfo partitionInfo; - public ColumnStatistic(double count, double ndv, ColumnStatistic original, double avgSizeByte, double numNulls, double dataSize, double minValue, double maxValue, - LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram, - String updatedTime, PartitionInfo partitionInfo) { + LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, + String updatedTime) { this.count = count; this.ndv = ndv; this.original = original; @@ -115,9 +105,7 @@ public class ColumnStatistic { this.minExpr = minExpr; this.maxExpr = maxExpr; this.isUnKnown = isUnKnown; - this.histogram = histogram; this.updatedTime = updatedTime; - this.partitionInfo = partitionInfo; } public static ColumnStatistic fromResultRow(List resultRows) { @@ -139,7 +127,6 @@ public class ColumnStatistic { if (columnStatistic == null) { return ColumnStatistic.UNKNOWN; } - columnStatistic.partitionIdToColStats.putAll(partitionIdToColStats); return columnStatistic; } @@ -242,7 +229,7 @@ public class ColumnStatistic { public ColumnStatistic updateBySelectivity(double selectivity, double rowCount) { if (isUnKnown) { - return UNKNOWN; + return this; } ColumnStatisticBuilder builder = new ColumnStatisticBuilder(this); Double rowsAfterFilter = rowCount * selectivity; @@ -329,7 +316,6 @@ public class ColumnStatistic { statistic.put("MinExpr", minExpr); statistic.put("MaxExpr", maxExpr); statistic.put("IsUnKnown", isUnKnown); - statistic.put("Histogram", Histogram.serializeToJson(histogram)); statistic.put("Original", original); statistic.put("LastUpdatedTime", updatedTime); return statistic; @@ -379,8 +365,7 @@ public class ColumnStatistic { null, null, stat.getBoolean("IsUnKnown"), - Histogram.deserializeFromJson(stat.getString("Histogram")), - stat.getString("LastUpdatedTime"), null + stat.getString("LastUpdatedTime") ); } @@ -388,10 +373,6 @@ public class ColumnStatistic { return Double.isInfinite(maxValue) || Double.isInfinite(minValue); } - public boolean hasHistogram() { - return histogram != null && histogram != Histogram.UNKNOWN; - } - public double getOriginalNdv() { if (original != null) { return original.ndv; @@ -399,16 +380,7 @@ public class ColumnStatistic { return ndv; } - // TODO expanded this function to support more cases, help to compute the change of ndv density - public boolean rangeChanged() { - return original != null && (minValue != original.minValue || maxValue != original.maxValue); - } - public boolean isUnKnown() { return isUnKnown; } - - public void putPartStats(String partId, ColumnStatistic columnStatistic) { - this.partitionIdToColStats.put(partId, columnStatistic); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index f97459555c..f8ed6a1b6a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -18,10 +18,6 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.LiteralExpr; -import org.apache.doris.catalog.PartitionInfo; - -import java.util.HashMap; -import java.util.Map; public class ColumnStatisticBuilder { private double count; @@ -36,28 +32,13 @@ public class ColumnStatisticBuilder { private boolean isUnknown; - private Histogram histogram; - private ColumnStatistic original; - private Map partitionIdToColStats = new HashMap<>(); - private String updatedTime; - private PartitionInfo partitionInfo; - public ColumnStatisticBuilder() { } - public PartitionInfo getPartitionInfo() { - return partitionInfo; - } - - public ColumnStatisticBuilder setPartitionInfo(PartitionInfo partitionInfo) { - this.partitionInfo = partitionInfo; - return this; - } - public ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.count = columnStatistic.count; this.ndv = columnStatistic.ndv; @@ -69,11 +50,8 @@ public class ColumnStatisticBuilder { this.minExpr = columnStatistic.minExpr; this.maxExpr = columnStatistic.maxExpr; this.isUnknown = columnStatistic.isUnKnown; - this.histogram = columnStatistic.histogram; this.original = columnStatistic.original; - this.partitionIdToColStats.putAll(columnStatistic.partitionIdToColStats); this.updatedTime = columnStatistic.updatedTime; - this.partitionInfo = columnStatistic.partitionInfo; } public ColumnStatisticBuilder setCount(double count) { @@ -171,15 +149,6 @@ public class ColumnStatisticBuilder { return isUnknown; } - public Histogram getHistogram() { - return histogram; - } - - public ColumnStatisticBuilder setHistogram(Histogram histogram) { - this.histogram = histogram; - return this; - } - public String getUpdatedTime() { return updatedTime; } @@ -194,13 +163,11 @@ public class ColumnStatisticBuilder { if (original == null && !isUnknown) { original = new ColumnStatistic(count, ndv, null, avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr, maxExpr, - isUnknown, histogram, updatedTime, partitionInfo); - original.partitionIdToColStats.putAll(partitionIdToColStats); + isUnknown, updatedTime); } ColumnStatistic colStats = new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr, maxExpr, - isUnknown, histogram, updatedTime, partitionInfo); - colStats.partitionIdToColStats.putAll(partitionIdToColStats); + isUnknown, updatedTime); return colStats; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java index c9b049a8cf..84110d5bda 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java @@ -39,11 +39,9 @@ import org.apache.logging.log4j.Logger; import java.time.Duration; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -205,11 +203,6 @@ public class StatisticsCache { LOG.warn("Error when preheating stats cache", t); } } - try { - loadPartStats(keyToColStats); - } catch (Exception e) { - LOG.warn("Fucka", e); - } } /** @@ -262,40 +255,4 @@ public class StatisticsCache { f.obtrudeValue(Optional.of(c)); columnStatisticsCache.put(k, f); } - - private void loadPartStats(Map keyToColStats) { - final int batchSize = Config.expr_children_limit; - Set keySet = new HashSet<>(); - for (StatisticsCacheKey statisticsCacheKey : keyToColStats.keySet()) { - if (keySet.size() < batchSize - 1) { - keySet.add(statisticsCacheKey); - } else { - List partStats = StatisticsRepository.loadPartStats(keySet); - addPartStatsToColStats(keyToColStats, partStats); - keySet = new HashSet<>(); - } - } - if (!keySet.isEmpty()) { - List partStats = StatisticsRepository.loadPartStats(keySet); - addPartStatsToColStats(keyToColStats, partStats); - } - } - - private void addPartStatsToColStats(Map keyToColStats, - List partsStats) { - for (ResultRow r : partsStats) { - try { - StatsId statsId = new StatsId(r); - long tblId = statsId.tblId; - long idxId = statsId.idxId; - String partId = statsId.partId; - String colId = statsId.colId; - ColumnStatistic partStats = ColumnStatistic.fromResultRow(r); - keyToColStats.get(new StatisticsCacheKey(tblId, idxId, colId)).putPartStats(partId, partStats); - } catch (Throwable t) { - LOG.warn("Failed to deserialized part stats", t); - } - } - } - } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java index 6a84364260..10be8c23a3 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java @@ -361,8 +361,8 @@ public class HyperGraphBuilder { for (Slot slot : scanPlan.getOutput()) { slotIdToColumnStats.put(slot, new ColumnStatistic(count, count, null, 1, 0, 0, 0, - count, null, null, true, null, - new Date().toString(), null)); + count, null, null, true, + new Date().toString())); } return new Statistics(count, slotIdToColumnStats); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java index cc1fa8353b..9fe8b09492 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java @@ -241,8 +241,8 @@ public class CacheTest extends TestWithFeService { public Optional getColumnStatistic(String colName) { return Optional.of(new ColumnStatistic(1, 2, null, 3, 4, 5, 6, 7, - null, null, false, null, - new Date().toString(), null)); + null, null, false, + new Date().toString())); } }; diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java index a1ff5b1358..c3f04bccfc 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java @@ -29,8 +29,8 @@ public class StatsDeriveResultTest { public void testUpdateRowCountByLimit() { StatsDeriveResult stats = new StatsDeriveResult(100); ColumnStatistic a = new ColumnStatistic(100, 10, null, 1, 5, 10, - 1, 100, null, null, false, null, - new Date().toString(), null); + 1, 100, null, null, false, + new Date().toString()); Id id = new Id(1); stats.addColumnStats(id, a); StatsDeriveResult res = stats.updateByLimit(0);