[improvement](statistics)Improve statistics cache loading logic. (#38829) (#39410)

backport: https://github.com/apache/doris/pull/38829
This commit is contained in:
Jibing-Li
2024-08-15 17:01:24 +08:00
committed by GitHub
parent 0680c8d314
commit 01090cf61f
7 changed files with 136 additions and 124 deletions

View File

@ -610,21 +610,13 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
continue;
}
ColumnStatisticsData data = tableStat.getStatsData();
try {
setStatData(column, data, columnStatisticBuilder, count);
} catch (AnalysisException e) {
if (LOG.isDebugEnabled()) {
LOG.debug(e);
}
return Optional.empty();
}
setStatData(column, data, columnStatisticBuilder, count);
}
return Optional.of(columnStatisticBuilder.build());
}
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, long count)
throws AnalysisException {
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, long count) {
long ndv = 0;
long nulls = 0;
double colSize = 0;

View File

@ -2547,7 +2547,14 @@ public class ShowExecutor {
if (indexName == null) {
continue;
}
columnStatistics.add(Pair.of(Pair.of(indexName, row.get(5)), ColumnStatistic.fromResultRow(row)));
try {
columnStatistics.add(Pair.of(Pair.of(indexName, row.get(5)), ColumnStatistic.fromResultRow(row)));
} catch (Exception e) {
LOG.warn("Failed to deserialize column statistics. reason: [{}]. Row [{}]", e.getMessage(), row);
if (LOG.isDebugEnabled()) {
LOG.debug(e);
}
}
}
}

View File

@ -112,99 +112,81 @@ public class ColumnStatistic {
public static ColumnStatistic fromResultRow(List<ResultRow> resultRows) {
Map<String, ColumnStatistic> partitionIdToColStats = new HashMap<>();
ColumnStatistic columnStatistic = null;
try {
for (ResultRow resultRow : resultRows) {
String partId = resultRow.get(6);
if (partId == null) {
columnStatistic = fromResultRow(resultRow);
} else {
partitionIdToColStats.put(partId, fromResultRow(resultRow));
}
for (ResultRow resultRow : resultRows) {
String partId = resultRow.get(6);
if (partId == null) {
columnStatistic = fromResultRow(resultRow);
} else {
partitionIdToColStats.put(partId, fromResultRow(resultRow));
}
} catch (Throwable t) {
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to deserialize column stats", t);
}
return ColumnStatistic.UNKNOWN;
}
if (columnStatistic == null) {
return ColumnStatistic.UNKNOWN;
}
return columnStatistic;
}
// TODO: use thrift
public static ColumnStatistic fromResultRow(ResultRow row) {
try {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
double count = Double.parseDouble(row.get(7));
columnStatisticBuilder.setCount(count);
double ndv = Double.parseDouble(row.getWithDefault(8, "0"));
columnStatisticBuilder.setNdv(ndv);
String nullCount = row.getWithDefault(9, "0");
columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount));
columnStatisticBuilder.setDataSize(Double
.parseDouble(row.getWithDefault(12, "0")));
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0
? 0 : columnStatisticBuilder.getDataSize()
/ columnStatisticBuilder.getCount());
long catalogId = Long.parseLong(row.get(1));
long idxId = Long.parseLong(row.get(4));
long dbID = Long.parseLong(row.get(2));
long tblId = Long.parseLong(row.get(3));
String colName = row.get(5);
Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName);
if (col == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to deserialize column statistics, ctlId: {} dbId: {}"
+ "tblId: {} column: {} not exists",
catalogId, dbID, tblId, colName);
}
return ColumnStatistic.UNKNOWN;
}
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
// Internal catalog get the min/max value using a separate SQL,
// and the value is already encoded by base64. Need to handle internal and external catalog separately.
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && min.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
} else {
try {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} min value {}.", col, min, e);
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
}
} else {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && max.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
} else {
try {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} max value {}.", col, max, e);
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
}
} else {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
columnStatisticBuilder.setUpdatedTime(row.get(13));
return columnStatisticBuilder.build();
} catch (Exception e) {
LOG.warn("Failed to deserialize column statistics. reason: [{}]. Row [{}]", e.getMessage(), row);
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
double count = Double.parseDouble(row.get(7));
columnStatisticBuilder.setCount(count);
double ndv = Double.parseDouble(row.getWithDefault(8, "0"));
columnStatisticBuilder.setNdv(ndv);
String nullCount = row.getWithDefault(9, "0");
columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount));
columnStatisticBuilder.setDataSize(Double
.parseDouble(row.getWithDefault(12, "0")));
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0
? 0 : columnStatisticBuilder.getDataSize()
/ columnStatisticBuilder.getCount());
long catalogId = Long.parseLong(row.get(1));
long idxId = Long.parseLong(row.get(4));
long dbID = Long.parseLong(row.get(2));
long tblId = Long.parseLong(row.get(3));
String colName = row.get(5);
Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName);
if (col == null) {
if (LOG.isDebugEnabled()) {
LOG.debug(e);
LOG.debug("Failed to deserialize column statistics, ctlId: {} dbId: {}"
+ "tblId: {} column: {} not exists",
catalogId, dbID, tblId, colName);
}
return ColumnStatistic.UNKNOWN;
}
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
// Internal catalog get the min/max value using a separate SQL,
// and the value is already encoded by base64. Need to handle internal and external catalog separately.
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && min.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
} else {
try {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} min value {}.", col, min, e);
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
}
} else {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && max.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
} else {
try {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} max value {}.", col, max, e);
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
}
} else {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
columnStatisticBuilder.setUpdatedTime(row.get(13));
return columnStatisticBuilder.build();
}
public static boolean isAlmostUnique(double ndv, double rowCount) {

View File

@ -18,7 +18,6 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.qe.InternalQueryExecutionException;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.logging.log4j.LogManager;
@ -33,22 +32,14 @@ public class ColumnStatisticsCacheLoader extends BasicAsyncCacheLoader<Statistic
@Override
protected Optional<ColumnStatistic> doLoad(StatisticsCacheKey key) {
Optional<ColumnStatistic> columnStatistic = Optional.empty();
Optional<ColumnStatistic> columnStatistic;
try {
// Load from statistics table.
columnStatistic = loadFromStatsTable(key);
if (!columnStatistic.isPresent()) {
// Load from data source metadata
try {
TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId);
columnStatistic = table.getColumnStatistic(key.colName);
} catch (Exception e) {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Exception to get column statistics by metadata."
+ "[Catalog:{}, DB:{}, Table:{}]",
key.catalogId, key.dbId, key.tableId), e);
}
}
TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId);
columnStatistic = table.getColumnStatistic(key.colName);
}
} catch (Throwable t) {
LOG.warn("Failed to load stats for column [Catalog:{}, DB:{}, Table:{}, Column:{}], Reason: {}",
@ -56,6 +47,7 @@ public class ColumnStatisticsCacheLoader extends BasicAsyncCacheLoader<Statistic
if (LOG.isDebugEnabled()) {
LOG.debug(t);
}
return null;
}
if (columnStatistic.isPresent()) {
// For non-empty table, return UNKNOWN if we can't collect ndv value.
@ -69,22 +61,9 @@ public class ColumnStatisticsCacheLoader extends BasicAsyncCacheLoader<Statistic
}
private Optional<ColumnStatistic> loadFromStatsTable(StatisticsCacheKey key) {
List<ResultRow> columnResults = null;
try {
columnResults = StatisticsRepository.loadColStats(
List<ResultRow> columnResults = StatisticsRepository.loadColStats(
key.catalogId, key.dbId, key.tableId, key.idxId, key.colName);
} catch (InternalQueryExecutionException e) {
LOG.info("Failed to load stats for table {} column {}. Reason:{}",
key.tableId, key.colName, e.getMessage());
return Optional.empty();
}
ColumnStatistic columnStatistics;
try {
columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults);
} catch (Exception e) {
LOG.warn("Exception to deserialize column statistics", e);
return Optional.empty();
}
ColumnStatistic columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults);
if (columnStatistics == null) {
return Optional.empty();
} else {

View File

@ -111,11 +111,20 @@ public class StatisticsRepository {
public static ColumnStatistic queryColumnStatisticsByName(
long ctlId, long dbId, long tableId, long indexId, String colName) {
ColumnStatistic columnStatistic = ColumnStatistic.UNKNOWN;
ResultRow resultRow = queryColumnStatisticById(ctlId, dbId, tableId, indexId, colName);
if (resultRow == null) {
return ColumnStatistic.UNKNOWN;
return columnStatistic;
}
return ColumnStatistic.fromResultRow(resultRow);
try {
columnStatistic = ColumnStatistic.fromResultRow(resultRow);
} catch (Exception e) {
LOG.warn("Failed to deserialize column statistics. reason: [{}]. Row [{}]", e.getMessage(), resultRow);
if (LOG.isDebugEnabled()) {
LOG.debug(e);
}
}
return columnStatistic;
}
public static List<ColumnStatistic> queryColumnStatisticsByPartitions(TableName tableName, String colName,

View File

@ -163,8 +163,7 @@ public class StatisticsUtil {
}
}
public static ColumnStatistic deserializeToColumnStatistics(List<ResultRow> resultBatches)
throws Exception {
public static ColumnStatistic deserializeToColumnStatistics(List<ResultRow> resultBatches) {
if (CollectionUtils.isEmpty(resultBatches)) {
return null;
}