[fix](statistics) Use column update rows to decide min/max stats are valid or not (#34263)

This is a following pr of #33685
After #33703 merged, need to check update rows in column level instead of table level.
This commit is contained in:
Jibing-Li
2024-04-30 16:18:06 +08:00
committed by yiguolei
parent 971e10a9db
commit 8db4d48731

View File

@ -19,6 +19,7 @@ package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.IntLiteral;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
@ -124,6 +125,7 @@ import org.apache.doris.nereids.types.DataType;
import org.apache.doris.nereids.util.PlanUtils;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.statistics.AnalysisManager;
import org.apache.doris.statistics.ColStatsMeta;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.Histogram;
@ -764,10 +766,10 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
Set<SlotReference> slotSet = slotSetBuilder.build();
Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap = new HashMap<>();
TableIf table = catalogRelation.getTable();
boolean isOlapTable = table instanceof OlapTable;
AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager();
TableStatsMeta tableMeta = analysisManager.findTableStatsStatus(table.getId());
// rows newly updated after last analyze
long deltaRowCount = tableMeta == null ? 0 : tableMeta.updatedRows.get();
long tableUpdatedRows = tableMeta == null ? 0 : tableMeta.updatedRows.get();
double rowCount = catalogRelation.getTable().getRowCountForNereids();
boolean hasUnknownCol = false;
long idxId = -1;
@ -777,10 +779,6 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
idxId = olapScan.getSelectedIndexId();
}
}
if (deltaRowCount > 0 && LOG.isDebugEnabled()) {
LOG.debug("{} is partially analyzed, clear min/max values in column stats",
catalogRelation.getTable().getName());
}
for (SlotReference slotReference : slotSet) {
String colName = slotReference.getColumn().isPresent()
? slotReference.getColumn().get().getName()
@ -790,6 +788,13 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
if (colName == null) {
throw new RuntimeException(String.format("Invalid slot: %s", slotReference.getExprId()));
}
long deltaRowCount = 0;
if (isOlapTable) {
OlapTable olapTable = (OlapTable) table;
ColStatsMeta colMeta = tableMeta == null ? null : tableMeta.findColumnStatsMeta(
olapTable.getIndexNameById(idxId == -1 ? olapTable.getBaseIndexId() : idxId), colName);
deltaRowCount = colMeta == null ? 0 : tableUpdatedRows - colMeta.updatedRows;
}
ColumnStatistic cache;
if (!FeConstants.enableInternalSchemaDb
|| shouldIgnoreThisCol) {
@ -807,13 +812,20 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
hasUnknownCol = true;
}
if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) {
// deltaRowCount > 0 indicates that
// new data is loaded to the table after this column was analyzed last time.
// In this case, need to eliminate min/max value for this column.
if (deltaRowCount > 0) {
// clear min-max to avoid error estimation
// for example, after yesterday data loaded, user send query about yesterday immediately.
// since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer
// estimates the filter result is zero
colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
if (LOG.isDebugEnabled()) {
LOG.debug("{}.{} is partially analyzed, clear min/max values in column stats",
table.getName(), colName);
}
}
columnStatisticBuilderMap.put(slotReference, colStatsBuilder);
} else {