[enhancement](histogram) add histogram syntax and persist histogram statistics (#15490)

Histogram statistics are more expensive to collect, so they are collected and persisted separately from column statistics.

This PR does the following:
1. Add histogram syntax and the keyword `TABLE`
2. Add a task that collects histogram statistics
3. Persist histogram statistics
4. Replace fastjson with gson
5. Add unit tests

Relevant syntax examples:
> Following other databases such as MySQL, this adds the keyword `TABLE`.

```SQL
-- collect column statistics
ANALYZE TABLE statistics_test;

-- collect histogram statistics
ANALYZE TABLE statistics_test UPDATE HISTOGRAM ON col1,col2;
```
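
Both forms also accept an optional `PROPERTIES` clause (`opt_properties` in the grammar below); for example, using the timeout property defined in `AnalyzeStmt` (the value here is hypothetical):

```SQL
-- bound the collection task's runtime; property name from AnalyzeStmt
ANALYZE TABLE statistics_test UPDATE HISTOGRAM ON col1,col2
PROPERTIES("cbo_statistics_task_timeout_sec" = "300");
```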

Based on #15317.
ElvinWei
2023-01-07 00:55:42 +08:00
committed by GitHub
parent 76ad599fd7
commit 5dfdacd278
33 changed files with 640 additions and 209 deletions

View File

@ -615,7 +615,8 @@ terminal String
KW_WRITE,
KW_YEAR,
KW_MTMV,
KW_TYPECAST;
KW_TYPECAST,
KW_HISTOGRAM;
terminal COMMA, COLON, DOT, DOTDOTDOT, AT, STAR, LPAREN, RPAREN, SEMICOLON, LBRACKET, RBRACKET, DIVIDE, MOD, ADD, SUBTRACT;
terminal BITAND, BITOR, BITXOR, BITNOT;
@ -2648,10 +2649,14 @@ show_create_routine_load_stmt ::=
// analyze statement
analyze_stmt ::=
KW_ANALYZE table_name:tbl opt_col_list:cols opt_partition_names:partitionNames opt_properties:properties
KW_ANALYZE KW_TABLE table_name:tbl opt_col_list:cols opt_partition_names:partitionNames opt_properties:properties
{:
RESULT = new AnalyzeStmt(tbl, cols, partitionNames, properties);
:}
| KW_ANALYZE KW_TABLE table_name:tbl KW_UPDATE KW_HISTOGRAM KW_ON ident_list:cols opt_properties:properties
{:
RESULT = new AnalyzeStmt(tbl, cols, properties);
:}
;
// Grant statement

View File

@ -62,7 +62,6 @@ public class AlterColumnStatsStmt extends DdlStmt {
.add(ColumnStatistic.NUM_NULLS)
.add(ColumnStatistic.MIN_VALUE)
.add(ColumnStatistic.MAX_VALUE)
.add(ColumnStatistic.HISTOGRAM)
.add(StatsType.DATA_SIZE)
.build();

View File

@ -64,6 +64,8 @@ public class AnalyzeStmt extends DdlStmt {
// time to wait for collect statistics
public static final String CBO_STATISTICS_TASK_TIMEOUT_SEC = "cbo_statistics_task_timeout_sec";
public boolean isHistogram = false;
private static final ImmutableSet<String> PROPERTIES_SET = new ImmutableSet.Builder<String>()
.add(CBO_STATISTICS_TASK_TIMEOUT_SEC)
.build();
@ -76,7 +78,7 @@ public class AnalyzeStmt extends DdlStmt {
private TableIf table;
private final PartitionNames optPartitionNames;
private PartitionNames optPartitionNames;
private List<String> optColumnNames;
private Map<String, String> optProperties;
@ -85,6 +87,16 @@ public class AnalyzeStmt extends DdlStmt {
private final List<String> partitionNames = Lists.newArrayList();
public AnalyzeStmt(TableName tableName,
List<String> optColumnNames,
Map<String, String> optProperties) {
this.tableName = tableName;
this.optColumnNames = optColumnNames;
wholeTbl = CollectionUtils.isEmpty(optColumnNames);
isHistogram = true;
this.optProperties = optProperties;
}
public AnalyzeStmt(TableName tableName,
List<String> optColumnNames,
PartitionNames optPartitionNames,

View File

@ -2612,6 +2612,8 @@ public class FunctionSet<T> {
"", "", "", "", "", true, false, true, true));
addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, Lists.newArrayList(t, Type.DOUBLE, Type.INT), Type.VARCHAR, t,
"", "", "", "", "", true, false, true, true));
addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, Lists.newArrayList(t, Type.DOUBLE, Type.INT), Type.VARCHAR, t,
"", "", "", "", "", true, false, true, true));
}
// Avg

View File

@ -79,6 +79,7 @@ public class InternalSchemaInitializer extends Thread {
private void createTbl() throws UserException {
Env.getCurrentEnv().getInternalCatalog().createTable(buildStatisticsTblStmt());
Env.getCurrentEnv().getInternalCatalog().createTable(buildHistogramTblStmt());
Env.getCurrentEnv().getInternalCatalog().createTable(buildAnalysisJobTblStmt());
}
@ -115,7 +116,6 @@ public class InternalSchemaInitializer extends Thread {
columnDefs.add(new ColumnDef("null_count", TypeDef.create(PrimitiveType.BIGINT)));
columnDefs.add(new ColumnDef("min", TypeDef.createVarchar(ScalarType.MAX_VARCHAR_LENGTH)));
columnDefs.add(new ColumnDef("max", TypeDef.createVarchar(ScalarType.MAX_VARCHAR_LENGTH)));
columnDefs.add(new ColumnDef("histogram", TypeDef.createVarchar(ScalarType.MAX_VARCHAR_LENGTH)));
columnDefs.add(new ColumnDef("data_size_in_bytes", TypeDef.create(PrimitiveType.BIGINT)));
columnDefs.add(new ColumnDef("update_time", TypeDef.create(PrimitiveType.DATETIME)));
String engineName = "olap";
@ -138,6 +138,39 @@ public class InternalSchemaInitializer extends Thread {
return createTableStmt;
}
@VisibleForTesting
public CreateTableStmt buildHistogramTblStmt() throws UserException {
TableName tableName = new TableName("",
FeConstants.INTERNAL_DB_NAME, StatisticConstants.HISTOGRAM_TBL_NAME);
List<ColumnDef> columnDefs = new ArrayList<>();
columnDefs.add(new ColumnDef("id", TypeDef.createVarchar(StatisticConstants.ID_LEN)));
columnDefs.add(new ColumnDef("catalog_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)));
columnDefs.add(new ColumnDef("db_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)));
columnDefs.add(new ColumnDef("tbl_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)));
columnDefs.add(new ColumnDef("idx_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)));
columnDefs.add(new ColumnDef("col_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)));
columnDefs.add(new ColumnDef("sample_rate", TypeDef.create(PrimitiveType.DOUBLE)));
columnDefs.add(new ColumnDef("buckets", TypeDef.createVarchar(ScalarType.MAX_VARCHAR_LENGTH)));
columnDefs.add(new ColumnDef("update_time", TypeDef.create(PrimitiveType.DATETIME)));
String engineName = "olap";
KeysDesc keysDesc = new KeysDesc(KeysType.UNIQUE_KEYS,
Lists.newArrayList("id"));
DistributionDesc distributionDesc = new HashDistributionDesc(
StatisticConstants.STATISTIC_TABLE_BUCKET_COUNT,
Lists.newArrayList("id"));
Map<String, String> properties = new HashMap<String, String>() {
{
put("replication_num", String.valueOf(Config.statistic_internal_table_replica_num));
}
};
CreateTableStmt createTableStmt = new CreateTableStmt(true, false,
tableName, columnDefs, engineName, keysDesc, null, distributionDesc,
properties, null, "Doris internal statistics table, don't modify it", null);
StatisticsUtil.analyze(createTableStmt);
// createTableStmt.setClusterName(SystemInfoService.DEFAULT_CLUSTER);
return createTableStmt;
}
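
For orientation, the statement built here corresponds roughly to the DDL below. This is a sketch: the internal database name, the `id` and `buckets` lengths, and the bucket count stand in for `FeConstants`/`StatisticConstants` values not shown in this diff.

```SQL
CREATE TABLE IF NOT EXISTS __internal_schema.histogram_statistics (
    `id`          VARCHAR(4096),   -- StatisticConstants.ID_LEN (assumed)
    `catalog_id`  VARCHAR(64),     -- StatisticConstants.MAX_NAME_LEN
    `db_id`       VARCHAR(64),
    `tbl_id`      VARCHAR(64),
    `idx_id`      VARCHAR(64),
    `col_id`      VARCHAR(64),
    `sample_rate` DOUBLE,
    `buckets`     VARCHAR(65533),  -- ScalarType.MAX_VARCHAR_LENGTH (assumed)
    `update_time` DATETIME
) ENGINE = olap
UNIQUE KEY(`id`)
COMMENT "Doris internal statistics table, don't modify it"
DISTRIBUTED BY HASH(`id`) BUCKETS 7  -- STATISTIC_TABLE_BUCKET_COUNT (assumed)
PROPERTIES("replication_num" = "1");
```
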
@VisibleForTesting
public CreateTableStmt buildAnalysisJobTblStmt() throws UserException {
TableName tableName = new TableName("",

View File

@ -51,6 +51,7 @@ import org.apache.doris.statistics.AnalysisTaskInfo;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisType;
import org.apache.doris.statistics.AnalysisTaskScheduler;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.HistogramTask;
import org.apache.doris.statistics.MVAnalysisTask;
import org.apache.doris.statistics.OlapAnalysisTask;
import org.apache.doris.system.Backend;
@ -1005,6 +1006,9 @@ public class OlapTable extends Table {
@Override
public BaseAnalysisTask createAnalysisTask(AnalysisTaskScheduler scheduler, AnalysisTaskInfo info) {
if (info.analysisType.equals(AnalysisType.HISTOGRAM)) {
return new HistogramTask(scheduler, info);
}
if (info.analysisType.equals(AnalysisType.COLUMN)) {
return new OlapAnalysisTask(scheduler, info);
}

View File

@ -290,6 +290,7 @@ public class StatsCalculator extends DefaultPlanVisitor<StatsDeriveResult, Void>
return computeTopN(topN);
}
@Override
public StatsDeriveResult visitPhysicalLocalQuickSort(PhysicalLocalQuickSort<? extends Plan> sort, Void context) {
return groupExpression.childStatistics(0);
}
@ -447,7 +448,6 @@ public class StatsCalculator extends DefaultPlanVisitor<StatsDeriveResult, Void>
stats.dataSize < 0 ? stats.dataSize : stats.dataSize * groupingSetNum,
stats.minValue,
stats.maxValue,
stats.histogram,
stats.selectivity,
stats.minExpr,
stats.maxExpr,
@ -538,7 +538,6 @@ public class StatsCalculator extends DefaultPlanVisitor<StatsDeriveResult, Void>
leftStats.dataSize + rightStats.dataSize,
Math.min(leftStats.minValue, rightStats.minValue),
Math.max(leftStats.maxValue, rightStats.maxValue),
null,
1.0 / (leftStats.ndv + rightStats.ndv),
leftStats.minExpr,
leftStats.maxExpr,

View File

@ -71,7 +71,7 @@ public class Numbers extends TableValuedFunction {
Map<Id, ColumnStatistic> columnToStatistics = Maps.newHashMap();
ColumnStatistic columnStat = new ColumnStatistic(rowNum, rowNum, 8, 0, 8, 0, rowNum - 1,
null, 1.0 / rowNum, new IntLiteral(0, Type.BIGINT), new IntLiteral(rowNum - 1, Type.BIGINT), false);
1.0 / rowNum, new IntLiteral(0, Type.BIGINT), new IntLiteral(rowNum - 1, Type.BIGINT), false);
columnToStatistics.put(slots.get(0).getExprId(), columnStat);
return new StatsDeriveResult(rowNum, columnToStatistics);
} catch (Exception t) {

View File

@ -43,6 +43,8 @@ import java.util.concurrent.ConcurrentMap;
public class AnalysisManager {
public final AnalysisTaskScheduler taskScheduler;
private static final Logger LOG = LogManager.getLogger(AnalysisManager.class);
private static final String UPDATE_JOB_STATE_SQL_TEMPLATE = "UPDATE "
@ -51,8 +53,6 @@ public class AnalysisManager {
private final ConcurrentMap<Long, Map<Long, AnalysisTaskInfo>> analysisJobIdToTaskMap;
public final AnalysisTaskScheduler taskScheduler;
private StatisticsCache statisticsCache;
private final AnalysisTaskExecutor taskExecutor;
@ -76,10 +76,11 @@ public class AnalysisManager {
if (colNames != null) {
for (String colName : colNames) {
long taskId = Env.getCurrentEnv().getNextId();
AnalysisType analType = analyzeStmt.isHistogram ? AnalysisType.HISTOGRAM : AnalysisType.COLUMN;
AnalysisTaskInfo analysisTaskInfo = new AnalysisTaskInfoBuilder().setJobId(jobId)
.setTaskId(taskId).setCatalogName(catalogName).setDbName(db)
.setTblName(tbl.getTbl()).setColName(colName).setJobType(JobType.MANUAL)
.setAnalysisMethod(AnalysisMethod.FULL).setAnalysisType(AnalysisType.COLUMN)
.setAnalysisMethod(AnalysisMethod.FULL).setAnalysisType(analType)
.setState(AnalysisState.PENDING)
.setScheduleType(ScheduleType.ONCE).build();
try {

View File

@ -34,7 +34,8 @@ public class AnalysisTaskInfo {
public enum AnalysisType {
COLUMN,
INDEX
INDEX,
HISTOGRAM
}
public enum JobType {
@ -69,6 +70,10 @@ public class AnalysisTaskInfo {
public final AnalysisType analysisType;
// TODO: define constants or get them from configuration properties
public final double sampleRate = 0.2;
public final int maxBucketNum = 128;
public String message;
// finished or failed

View File

@ -26,9 +26,13 @@ import org.apache.doris.qe.StmtExecutor;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisType;
import com.google.common.annotations.VisibleForTesting;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public abstract class BaseAnalysisTask {
public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class);
protected static final String INSERT_PART_STATISTICS = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " SELECT "
@ -119,7 +123,8 @@ public abstract class BaseAnalysisTask {
info, AnalysisState.FAILED,
String.format("Table with name %s not exists", info.tblName), System.currentTimeMillis());
}
if (info.analysisType != null && info.analysisType.equals(AnalysisType.COLUMN)) {
if (info.analysisType != null && (info.analysisType.equals(AnalysisType.COLUMN)
|| info.analysisType.equals(AnalysisType.HISTOGRAM))) {
col = tbl.getColumn(info.colName);
if (col == null) {
Env.getCurrentEnv().getAnalysisManager().updateTaskStatus(

View File

@ -18,6 +18,15 @@
package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.collect.Lists;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import java.util.List;
public class Bucket {
public LiteralExpr lower;
@ -65,4 +74,20 @@ public class Bucket {
public void setNdv(int ndv) {
this.ndv = ndv;
}
public static List<Bucket> deserializeFromjson(Type datatype, JsonArray jsonArray)
throws AnalysisException {
List<Bucket> buckets = Lists.newArrayList();
for (int i = 0; i < jsonArray.size(); i++) {
JsonObject bucketJson = jsonArray.get(i).getAsJsonObject();
Bucket bucket = new Bucket();
bucket.lower = StatisticsUtil.readableValue(datatype, bucketJson.get("lower").getAsString());
bucket.upper = StatisticsUtil.readableValue(datatype, bucketJson.get("upper").getAsString());
bucket.count = bucketJson.get("count").getAsInt();
bucket.preSum = bucketJson.get("pre_sum").getAsInt();
bucket.ndv = bucketJson.get("ndv").getAsInt();
buckets.add(bucket);
}
return buckets;
}
}

View File

@ -19,12 +19,7 @@ package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.statistics.util.InternalQueryResult.ResultRow;
import org.apache.doris.statistics.util.StatisticsUtil;
@ -42,13 +37,12 @@ public class ColumnStatistic {
public static final StatsType NUM_NULLS = StatsType.NUM_NULLS;
public static final StatsType MIN_VALUE = StatsType.MIN_VALUE;
public static final StatsType MAX_VALUE = StatsType.MAX_VALUE;
public static final StatsType HISTOGRAM = StatsType.HISTOGRAM;
private static final Logger LOG = LogManager.getLogger(ColumnStatistic.class);
public static ColumnStatistic DEFAULT = new ColumnStatisticBuilder().setAvgSizeByte(1).setNdv(1)
.setNumNulls(1).setCount(1).setMaxValue(Double.MAX_VALUE).setMinValue(Double.MIN_VALUE)
.setHistogram(Histogram.defaultHistogram()).setSelectivity(1.0).setIsUnknown(true)
.setSelectivity(1.0).setIsUnknown(true)
.build();
public static final Set<Type> MAX_MIN_UNSUPPORTED_TYPE = new HashSet<>();
@ -68,7 +62,6 @@ public class ColumnStatistic {
public final double avgSizeByte;
public final double minValue;
public final double maxValue;
public final Histogram histogram;
public final boolean isUnKnown;
/*
selectivity of Column T1.A:
@ -90,8 +83,7 @@ public class ColumnStatistic {
public ColumnStatistic(double count, double ndv, double avgSizeByte,
double numNulls, double dataSize, double minValue, double maxValue,
Histogram histogram, double selectivity, LiteralExpr minExpr,
LiteralExpr maxExpr, boolean isUnKnown) {
double selectivity, LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown) {
this.count = count;
this.ndv = ndv;
this.avgSizeByte = avgSizeByte;
@ -99,7 +91,6 @@ public class ColumnStatistic {
this.dataSize = dataSize;
this.minValue = minValue;
this.maxValue = maxValue;
this.histogram = histogram;
this.selectivity = selectivity;
this.minExpr = minExpr;
this.maxExpr = maxExpr;
@ -127,7 +118,7 @@ public class ColumnStatistic {
long dbID = Long.parseLong(resultRow.getColumnValue("db_id"));
long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id"));
String colName = resultRow.getColumnValue("col_id");
Column col = findColumn(catalogId, dbID, tblId, idxId, colName);
Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName);
if (col == null) {
LOG.warn("Failed to deserialize column statistics, ctlId: {} dbId: {}"
+ "tblId: {} column: {} not exists",
@ -136,10 +127,8 @@ public class ColumnStatistic {
}
String min = resultRow.getColumnValue("min");
String max = resultRow.getColumnValue("max");
String histogram = resultRow.getColumnValue("histogram");
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setHistogram(Histogram.deserializeFromJson(col.getType(), histogram));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
columnStatisticBuilder.setSelectivity(1.0);
@ -158,7 +147,7 @@ public class ColumnStatistic {
public ColumnStatistic copy() {
return new ColumnStatisticBuilder().setCount(count).setNdv(ndv).setAvgSizeByte(avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(minValue)
.setMaxValue(maxValue).setHistogram(histogram).setMinExpr(minExpr).setMaxExpr(maxExpr)
.setMaxValue(maxValue).setMinExpr(minExpr).setMaxExpr(maxExpr)
.setSelectivity(selectivity).setIsUnknown(isUnKnown).build();
}
@ -182,7 +171,6 @@ public class ColumnStatistic {
.setDataSize(Math.ceil(dataSize * ratio))
.setMinValue(minValue)
.setMaxValue(maxValue)
.setHistogram(histogram)
.setMinExpr(minExpr)
.setMaxExpr(maxExpr)
.setSelectivity(newSelectivity)
@ -194,28 +182,6 @@ public class ColumnStatistic {
return Math.max(this.minValue, other.minValue) <= Math.min(this.maxValue, other.maxValue);
}
public static Column findColumn(long catalogId, long dbId, long tblId, long idxId, String columnName) {
CatalogIf<DatabaseIf<TableIf>> catalogIf = Env.getCurrentEnv().getCatalogMgr().getCatalog(catalogId);
if (catalogIf == null) {
return null;
}
DatabaseIf<TableIf> db = catalogIf.getDb(dbId).orElse(null);
if (db == null) {
return null;
}
TableIf tblIf = db.getTable(tblId).orElse(null);
if (tblIf == null) {
return null;
}
if (idxId != -1) {
if (tblIf instanceof OlapTable) {
OlapTable olapTable = (OlapTable) tblIf;
return olapTable.getIndexIdToMeta().get(idxId).getColumnByName(columnName);
}
}
return tblIf.getColumn(columnName);
}
public ColumnStatistic updateBySelectivity(double selectivity, double rowCount) {
if (isUnKnown) {
return DEFAULT;

View File

@ -27,7 +27,6 @@ public class ColumnStatisticBuilder {
private double dataSize;
private double minValue;
private double maxValue;
private Histogram histogram;
private double selectivity = 1.0;
private LiteralExpr minExpr;
private LiteralExpr maxExpr;
@ -45,7 +44,6 @@ public class ColumnStatisticBuilder {
this.dataSize = columnStatistic.dataSize;
this.minValue = columnStatistic.minValue;
this.maxValue = columnStatistic.maxValue;
this.histogram = columnStatistic.histogram;
this.selectivity = columnStatistic.selectivity;
this.minExpr = columnStatistic.minExpr;
this.maxExpr = columnStatistic.maxExpr;
@ -87,11 +85,6 @@ public class ColumnStatisticBuilder {
return this;
}
public ColumnStatisticBuilder setHistogram(Histogram histogram) {
this.histogram = histogram;
return this;
}
public ColumnStatisticBuilder setSelectivity(double selectivity) {
this.selectivity = selectivity;
return this;
@ -140,10 +133,6 @@ public class ColumnStatisticBuilder {
return maxValue;
}
public Histogram getHistogram() {
return histogram;
}
public double getSelectivity() {
return selectivity;
}
@ -162,6 +151,6 @@ public class ColumnStatisticBuilder {
public ColumnStatistic build() {
return new ColumnStatistic(count, ndv, avgSizeByte, numNulls,
dataSize, minValue, maxValue, histogram, selectivity, minExpr, maxExpr, isUnknown);
dataSize, minValue, maxValue, selectivity, minExpr, maxExpr, isUnknown);
}
}

View File

@ -17,14 +17,15 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Type;
import org.apache.doris.statistics.util.InternalQueryResult.ResultRow;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.google.common.collect.Lists;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.parquet.Strings;
@ -34,71 +35,71 @@ import java.util.List;
public class Histogram {
private static final Logger LOG = LogManager.getLogger(Histogram.class);
private Type dataType;
public final Type dataType;
private int maxBucketSize;
private int bucketSize;
private float sampleRate;
public final int maxBucketNum;
private List<Bucket> buckets;
public final int bucketNum;
public Histogram(Type dataType) {
public final double sampleRate;
public final List<Bucket> buckets;
public Histogram(Type dataType, int maxBucketNum, int bucketNum,
double sampleRate, List<Bucket> buckets) {
this.dataType = dataType;
}
public Type getDataType() {
return dataType;
}
public void setDataType(Type dataType) {
this.dataType = dataType;
}
public int getMaxBucketSize() {
return maxBucketSize;
}
public void setMaxBucketSize(int maxBucketSize) {
this.maxBucketSize = maxBucketSize;
}
public int getBucketSize() {
return bucketSize;
}
public void setBucketSize(int bucketSize) {
this.bucketSize = bucketSize;
}
public float getSampleRate() {
return sampleRate;
}
public void setSampleRate(float sampleRate) {
if (sampleRate < 0f || sampleRate > 1f) {
this.sampleRate = 1f;
} else {
this.sampleRate = sampleRate;
}
}
public void setBuckets(List<Bucket> buckets) {
this.maxBucketNum = maxBucketNum;
this.bucketNum = bucketNum;
this.sampleRate = sampleRate;
this.buckets = buckets;
}
public List<Bucket> getBuckets() {
return buckets;
}
public static Histogram DEFAULT = new HistogramBuilder()
.setDataType(Type.INVALID).setMaxBucketNum(1)
.setBucketNum(0).setSampleRate(1.0).setBuckets(Lists.newArrayList()).build();
public static Histogram defaultHistogram() {
Type type = Type.fromPrimitiveType(PrimitiveType.INVALID_TYPE);
List<Bucket> buckets = Lists.newArrayList();
Histogram histogram = new Histogram(type);
histogram.setMaxBucketSize(0);
histogram.setBucketSize(0);
histogram.setSampleRate(1.0f);
histogram.setBuckets(buckets);
return histogram;
// TODO: use thrift
public static Histogram fromResultRow(ResultRow resultRow) {
try {
HistogramBuilder histogramBuilder = new HistogramBuilder();
long catalogId = Long.parseLong(resultRow.getColumnValue("catalog_id"));
long idxId = Long.parseLong(resultRow.getColumnValue("idx_id"));
long dbId = Long.parseLong(resultRow.getColumnValue("db_id"));
long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id"));
String colName = resultRow.getColumnValue("col_id");
Column col = StatisticsUtil.findColumn(catalogId, dbId, tblId, idxId, colName);
if (col == null) {
LOG.warn("Failed to deserialize histogram statistics, ctlId: {} dbId: {}"
+ "tblId: {} column: {} not exists",
catalogId, dbId, tblId, colName);
return Histogram.DEFAULT;
}
double sampleRate = Double.parseDouble(resultRow.getColumnValue("sample_rate"));
histogramBuilder.setSampleRate(sampleRate);
histogramBuilder.setDataType(col.getType());
String json = resultRow.getColumnValue("buckets");
JsonObject jsonObj = JsonParser.parseString(json).getAsJsonObject();
int maxBucketNum = jsonObj.get("max_bucket_num").getAsInt();
histogramBuilder.setMaxBucketNum(maxBucketNum);
int bucketNum = jsonObj.get("bucket_num").getAsInt();
histogramBuilder.setBucketNum(bucketNum);
JsonArray jsonArray = jsonObj.getAsJsonArray("buckets");
List<Bucket> buckets = Bucket.deserializeFromjson(col.getType(), jsonArray);
histogramBuilder.setBuckets(buckets);
return histogramBuilder.build();
} catch (Exception e) {
e.printStackTrace();
LOG.warn("Failed to deserialize histogram statistics.", e);
return Histogram.DEFAULT;
}
}
/**
@ -111,37 +112,28 @@ public class Histogram {
}
try {
Histogram histogram = new Histogram(datatype);
JSONObject histogramJson = JSON.parseObject(json);
HistogramBuilder histogramBuilder = new HistogramBuilder();
List<Bucket> buckets = Lists.newArrayList();
JSONArray jsonArray = histogramJson.getJSONArray("buckets");
histogramBuilder.setDataType(datatype);
for (int i = 0; i < jsonArray.size(); i++) {
JSONObject bucketJson = jsonArray.getJSONObject(i);
Bucket bucket = new Bucket();
bucket.lower = StatisticsUtil.readableValue(datatype, bucketJson.get("lower").toString());
bucket.upper = StatisticsUtil.readableValue(datatype, bucketJson.get("upper").toString());
bucket.count = bucketJson.getIntValue("count");
bucket.preSum = bucketJson.getIntValue("pre_sum");
bucket.ndv = bucketJson.getIntValue("ndv");
buckets.add(bucket);
}
JsonObject histogramJson = JsonParser.parseString(json).getAsJsonObject();
JsonArray jsonArray = histogramJson.getAsJsonArray("buckets");
List<Bucket> buckets = Bucket.deserializeFromjson(datatype, jsonArray);
histogram.setBuckets(buckets);
histogramBuilder.setBuckets(buckets);
int maxBucketSize = histogramJson.getIntValue("max_bucket_size");
histogram.setMaxBucketSize(maxBucketSize);
int maxBucketSize = histogramJson.get("max_bucket_num").getAsInt();
histogramBuilder.setMaxBucketNum(maxBucketSize);
int bucketSize = histogramJson.getIntValue("bucket_size");
histogram.setBucketSize(bucketSize);
int bucketSize = histogramJson.get("bucket_num").getAsInt();
histogramBuilder.setBucketNum(bucketSize);
float sampleRate = histogramJson.getFloatValue("sample_rate");
histogram.setSampleRate(sampleRate);
float sampleRate = histogramJson.get("sample_rate").getAsFloat();
histogramBuilder.setSampleRate(sampleRate);
return histogram;
return histogramBuilder.build();
} catch (Throwable e) {
LOG.warn("deserialize from json error, input json string: {}", json, e);
LOG.error("deserialize from json error.", e);
}
return null;
@ -155,26 +147,25 @@ public class Histogram {
return "";
}
JSONObject histogramJson = new JSONObject();
histogramJson.put("max_bucket_size", histogram.maxBucketSize);
histogramJson.put("bucket_size", histogram.bucketSize);
histogramJson.put("sample_rate", histogram.sampleRate);
JsonObject histogramJson = new JsonObject();
JSONArray bucketsJsonArray = new JSONArray();
histogramJson.put("buckets", bucketsJsonArray);
histogramJson.addProperty("max_bucket_num", histogram.maxBucketNum);
histogramJson.addProperty("bucket_num", histogram.bucketNum);
histogramJson.addProperty("sample_rate", histogram.sampleRate);
if (histogram.buckets != null) {
for (Bucket bucket : histogram.buckets) {
JSONObject bucketJson = new JSONObject();
bucketJson.put("count", bucket.count);
bucketJson.put("pre_sum", bucket.preSum);
bucketJson.put("ndv", bucket.ndv);
bucketJson.put("upper", bucket.upper.getStringValue());
bucketJson.put("lower", bucket.lower.getStringValue());
bucketsJsonArray.add(bucketJson);
}
JsonArray bucketsJsonArray = new JsonArray();
histogramJson.add("buckets", bucketsJsonArray);
for (Bucket bucket : histogram.buckets) {
JsonObject bucketJson = new JsonObject();
bucketJson.addProperty("count", bucket.count);
bucketJson.addProperty("pre_sum", bucket.preSum);
bucketJson.addProperty("ndv", bucket.ndv);
bucketJson.addProperty("upper", bucket.upper.getStringValue());
bucketJson.addProperty("lower", bucket.lower.getStringValue());
bucketsJsonArray.add(bucketJson);
}
return histogramJson.toJSONString();
return histogramJson.toString();
}
}
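
Serialized, a histogram is a single JSON document. Its shape, taken from the test fixture added later in this commit (only the first bucket shown):

```json
{
  "max_bucket_num": 128,
  "bucket_num": 5,
  "sample_rate": 1.0,
  "buckets": [
    {"lower": "2022-09-21 17:30:29", "upper": "2022-09-21 22:30:29",
     "count": 9, "pre_sum": 0, "ndv": 1}
  ]
}
```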

View File

@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
import org.apache.doris.catalog.Type;
import java.util.Comparator;
import java.util.List;
public class HistogramBuilder {
private Type dataType;
private int maxBucketNum;
private int bucketNum;
private double sampleRate;
private List<Bucket> buckets;
public HistogramBuilder() {
}
public HistogramBuilder(Histogram histogram) {
this.dataType = histogram.dataType;
this.maxBucketNum = histogram.maxBucketNum;
this.bucketNum = histogram.bucketNum;
this.sampleRate = histogram.sampleRate;
this.buckets = histogram.buckets;
}
public HistogramBuilder setDataType(Type dataType) {
this.dataType = dataType;
return this;
}
public HistogramBuilder setMaxBucketNum(int maxBucketNum) {
this.maxBucketNum = maxBucketNum;
return this;
}
public HistogramBuilder setBucketNum(int bucketNum) {
this.bucketNum = bucketNum;
return this;
}
public HistogramBuilder setSampleRate(double sampleRate) {
if (sampleRate < 0 || sampleRate > 1.0) {
this.sampleRate = 1.0;
} else {
this.sampleRate = sampleRate;
}
return this;
}
public HistogramBuilder setBuckets(List<Bucket> buckets) {
buckets.sort(Comparator.comparing(Bucket::getLower));
this.buckets = buckets;
return this;
}
public Type getDataType() {
return dataType;
}
public int getMaxBucketNum() {
return maxBucketNum;
}
public int getBucketNum() {
return bucketNum;
}
public double getSampleRate() {
return sampleRate;
}
public List<Bucket> getBuckets() {
return buckets;
}
public Histogram build() {
return new Histogram(dataType, maxBucketNum, bucketNum, sampleRate, buckets);
}
}
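
As a usage reference, `Histogram.DEFAULT` above is exactly what the builder yields with its sentinel values:

```java
// mirrors the DEFAULT constant in Histogram; not new behavior
Histogram defaultHist = new HistogramBuilder()
        .setDataType(Type.INVALID)
        .setMaxBucketNum(1)
        .setBucketNum(0)
        .setSampleRate(1.0)
        .setBuckets(Lists.newArrayList()) // empty list sorts trivially
        .build();
```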

View File

@ -0,0 +1,90 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.FeConstants;
import org.apache.doris.qe.AutoCloseConnectContext;
import org.apache.doris.qe.StmtExecutor;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.text.StringSubstitutor;
import java.util.HashMap;
import java.util.Map;
/**
* Each task analyzes one column.
*/
public class HistogramTask extends BaseAnalysisTask {
/** To avoid too much data, use the following efficient sampling method */
private static final String ANALYZE_HISTOGRAM_SQL_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${histogramStatTbl} "
+ "SELECT "
+ " CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+ " ${catalogId} AS catalog_id, "
+ " ${dbId} AS db_id, "
+ " ${tblId} AS tbl_id, "
+ " ${idxId} AS idx_id, "
+ " '${colId}' AS col_id, "
+ " ${sampleRate} AS sample_rate, "
+ " `HISTOGRAM`(`${colName}`, 1, ${maxBucketNum}) AS buckets, "
+ " NOW() AS create_time "
+ "FROM "
+ " `${dbName}`.`${tblName}` TABLESAMPLE (${percentValue} PERCENT)";
@VisibleForTesting
public HistogramTask() {
super();
}
public HistogramTask(AnalysisTaskScheduler analysisTaskScheduler, AnalysisTaskInfo info) {
super(analysisTaskScheduler, info);
}
@Override
public void execute() throws Exception {
Map<String, String> params = new HashMap<>();
params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
params.put("histogramStatTbl", StatisticConstants.HISTOGRAM_TBL_NAME);
params.put("catalogId", String.valueOf(catalog.getId()));
params.put("dbId", String.valueOf(db.getId()));
params.put("tblId", String.valueOf(tbl.getId()));
params.put("idxId", "-1");
params.put("colId", String.valueOf(info.colName));
params.put("dbName", info.dbName);
params.put("tblName", String.valueOf(info.tblName));
params.put("colName", String.valueOf(info.colName));
params.put("sampleRate", String.valueOf(info.sampleRate));
params.put("maxBucketNum", String.valueOf(info.maxBucketNum));
params.put("percentValue", String.valueOf((int) (info.sampleRate * 100)));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String histogramSql = stringSubstitutor.replace(ANALYZE_HISTOGRAM_SQL_TEMPLATE);
LOG.info("SQL to collect the histogram:\n {}", histogramSql);
try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) {
this.stmtExecutor = new StmtExecutor(r.connectContext, histogramSql);
this.stmtExecutor.execute();
}
Env.getCurrentEnv().getStatisticsCache().refreshSync(tbl.getId(), -1, col.getName());
}
}
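
After substitution, the executed statement looks roughly like this. A sketch: the ids, internal database name, and source table are hypothetical; `sampleRate` (0.2) and `maxBucketNum` (128) come from `AnalysisTaskInfo`.

```SQL
INSERT INTO __internal_schema.histogram_statistics
SELECT
    CONCAT(10010, '-', -1, '-', 'col1') AS id,
    0 AS catalog_id,
    10002 AS db_id,
    10010 AS tbl_id,
    -1 AS idx_id,
    'col1' AS col_id,
    0.2 AS sample_rate,
    `HISTOGRAM`(`col1`, 1, 128) AS buckets,
    NOW() AS create_time
FROM `test_db`.`statistics_test` TABLESAMPLE (20 PERCENT);
```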

View File

@ -58,12 +58,12 @@ public class HiveAnalysisTask extends HMSAnalysisTask {
private static final String ANALYZE_PARTITION_SQL_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " values ('${id}','${catalogId}', '${dbId}', '${tblId}', '-1', '${colId}', '${partId}', "
+ "${numRows}, ${ndv}, ${nulls}, '${min}', '${max}', NULL, ${dataSize}, '${update_time}')";
+ "${numRows}, ${ndv}, ${nulls}, '${min}', '${max}', ${dataSize}, '${update_time}')";
private static final String ANALYZE_TABLE_SQL_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " values ('${id}','${catalogId}', '${dbId}', '${tblId}', NULL, '${colId}', NULL, "
+ "${numRows}, ${ndv}, ${nulls}, '${min}', '${max}', NULL, ${dataSize}, '${update_time}')";
+ "${numRows}, ${ndv}, ${nulls}, '${min}', '${max}', ${dataSize}, '${update_time}')";
@Override
protected void getColumnStatsByMeta() throws Exception {

View File

@ -52,7 +52,7 @@ public class IcebergAnalysisTask extends HMSAnalysisTask {
private static final String INSERT_TABLE_SQL_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " values ('${id}','${catalogId}', '${dbId}', '${tblId}', '-1', '${colId}', NULL, "
+ "${numRows}, 0, ${nulls}, '0', '0', NULL, ${dataSize}, '${update_time}')";
+ "${numRows}, 0, ${nulls}, '0', '0', ${dataSize}, '${update_time}')";
@Override

View File

@ -0,0 +1,55 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
public class Statistic {
public Histogram histogram;
public ColumnStatistic columnStatistic;
public Statistic() {
}
public Statistic(Histogram histogram, ColumnStatistic columnStatistic) {
this.histogram = histogram;
this.columnStatistic = columnStatistic;
}
public Histogram getHistogram() {
if (histogram != null) {
return histogram;
}
return Histogram.DEFAULT;
}
public void setHistogram(Histogram histogram) {
this.histogram = histogram;
}
public ColumnStatistic getColumnStatistic() {
if (columnStatistic != null) {
return columnStatistic;
}
return ColumnStatistic.DEFAULT;
}
public void setColumnStatistic(ColumnStatistic columnStatistic) {
this.columnStatistic = columnStatistic;
}
}
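
The getters fall back to the DEFAULT sentinels rather than returning null, so cache consumers can use the result unconditionally; a minimal sketch:

```java
// a sketch; histogram/columnStatistic start unset (null)
Statistic s = new Statistic();
Histogram h = s.getHistogram();              // == Histogram.DEFAULT
ColumnStatistic c = s.getColumnStatistic();  // == ColumnStatistic.DEFAULT
```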

View File

@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit;
public class StatisticConstants {
public static final String STATISTIC_TBL_NAME = "column_statistics";
public static final String HISTOGRAM_TBL_NAME = "histogram_statistics";
public static final String ANALYSIS_JOB_TABLE = "analysis_jobs";
public static final int MAX_NAME_LEN = 64;

View File

@ -31,7 +31,7 @@ public class StatisticsCache {
private static final Logger LOG = LogManager.getLogger(StatisticsCache.class);
private final AsyncLoadingCache<StatisticsCacheKey, ColumnStatistic> cache = Caffeine.newBuilder()
private final AsyncLoadingCache<StatisticsCacheKey, Statistic> cache = Caffeine.newBuilder()
.maximumSize(StatisticConstants.STATISTICS_RECORDS_CACHE_SIZE)
.expireAfterAccess(Duration.ofHours(StatisticConstants.STATISTICS_CACHE_VALID_DURATION_IN_HOURS))
.refreshAfterWrite(Duration.ofHours(StatisticConstants.STATISTICS_CACHE_REFRESH_INTERVAL))
@ -42,29 +42,49 @@ public class StatisticsCache {
}
public ColumnStatistic getColumnStatistics(long tblId, long idxId, String colName) {
if (ConnectContext.get().getSessionVariable().internalSession) {
ConnectContext ctx = ConnectContext.get();
if (ctx != null && ctx.getSessionVariable().internalSession) {
return ColumnStatistic.DEFAULT;
}
StatisticsCacheKey k = new StatisticsCacheKey(tblId, idxId, colName);
try {
CompletableFuture<ColumnStatistic> f = cache.get(k);
if (f.isDone()) {
return f.get();
CompletableFuture<Statistic> f = cache.get(k);
if (f.isDone() && f.get() != null) {
return f.get().getColumnStatistic();
}
} catch (Exception e) {
LOG.warn("Unexpected exception while returning ColumnStatistic", e);
return ColumnStatistic.DEFAULT;
}
return ColumnStatistic.DEFAULT;
}
public Histogram getHistogram(long tblId, String colName) {
return getHistogram(tblId, -1, colName);
}
public Histogram getHistogram(long tblId, long idxId, String colName) {
ConnectContext ctx = ConnectContext.get();
if (ctx != null && ctx.getSessionVariable().internalSession) {
return Histogram.DEFAULT;
}
StatisticsCacheKey k = new StatisticsCacheKey(tblId, idxId, colName);
try {
CompletableFuture<Statistic> f = cache.get(k);
if (f.isDone() && f.get() != null) {
return f.get().getHistogram();
}
} catch (Exception e) {
LOG.warn("Unexpected exception while returning Histogram", e);
}
return Histogram.DEFAULT;
}
// TODO: finish this method.
public void eraseExpiredCache(long tblId, long idxId, String colName) {
cache.synchronous().invalidate(new StatisticsCacheKey(tblId, idxId, colName));
}
public void updateCache(long tblId, long idxId, String colName, ColumnStatistic statistic) {
public void updateCache(long tblId, long idxId, String colName, Statistic statistic) {
cache.synchronous().put(new StatisticsCacheKey(tblId, idxId, colName), statistic);
}
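
A lookup sketch for the new histogram path (the table id is hypothetical and a running FE environment is assumed; `pre_sum` semantics follow the test fixture below):

```java
StatisticsCache cache = Env.getCurrentEnv().getStatisticsCache();
Histogram hist = cache.getHistogram(tblId, "col1"); // idxId defaults to -1
if (hist != Histogram.DEFAULT && hist.bucketNum > 0) {
    // pre_sum is the cumulative count of preceding buckets, so the last
    // bucket's pre_sum + count gives the number of sampled rows
    Bucket last = hist.buckets.get(hist.bucketNum - 1);
    int sampledRows = last.preSum + last.count;
}
```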

View File

@ -35,7 +35,7 @@ import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
public class StatisticsCacheLoader implements AsyncCacheLoader<StatisticsCacheKey, ColumnStatistic> {
public class StatisticsCacheLoader implements AsyncCacheLoader<StatisticsCacheKey, Statistic> {
private static final Logger LOG = LogManager.getLogger(StatisticsCacheLoader.class);
@ -43,13 +43,17 @@ public class StatisticsCacheLoader implements AsyncCacheLoader<StatisticsCacheKe
+ "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE "
+ "id = CONCAT('${tblId}', '-', ${idxId}, '-', '${colId}')";
private static final String QUERY_HISTOGRAM_STATISTICS = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME
+ "." + StatisticConstants.HISTOGRAM_TBL_NAME + " WHERE "
+ "id = CONCAT('${tblId}', '-', ${idxId}, '-', '${colId}')";
private static int CUR_RUNNING_LOAD = 0;
private static final Object LOCK = new Object();
// TODO: Maybe we should trigger an analyze job when the required ColumnStatistic doesn't exist.
@Override
public @NonNull CompletableFuture<ColumnStatistic> asyncLoad(@NonNull StatisticsCacheKey key,
public @NonNull CompletableFuture<Statistic> asyncLoad(@NonNull StatisticsCacheKey key,
@NonNull Executor executor) {
synchronized (LOCK) {
if (CUR_RUNNING_LOAD > StatisticConstants.LOAD_TASK_LIMITS) {
@ -61,31 +65,53 @@ public class StatisticsCacheLoader implements AsyncCacheLoader<StatisticsCacheKe
}
CUR_RUNNING_LOAD++;
return CompletableFuture.supplyAsync(() -> {
Statistic statistic = new Statistic();
try {
Map<String, String> params = new HashMap<>();
params.put("tblId", String.valueOf(key.tableId));
params.put("idxId", String.valueOf(key.idxId));
params.put("colId", String.valueOf(key.colName));
List<ResultRow> resultBatches =
List<ColumnStatistic> columnStatistics;
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(QUERY_COLUMN_STATISTICS));
List<ColumnStatistic> columnStatistics = null;
try {
columnStatistics = StatisticsUtil.deserializeToColumnStatistics(resultBatches);
columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResult);
} catch (Exception e) {
LOG.warn("Failed to deserialize column statistics", e);
throw new CompletionException(e);
}
if (CollectionUtils.isEmpty(columnStatistics)) {
return ColumnStatistic.DEFAULT;
statistic.setColumnStatistic(ColumnStatistic.DEFAULT);
} else {
statistic.setColumnStatistic(columnStatistics.get(0));
}
List<Histogram> histogramStatistics;
List<ResultRow> histogramResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(QUERY_HISTOGRAM_STATISTICS));
try {
histogramStatistics = StatisticsUtil.deserializeToHistogramStatistics(histogramResult);
} catch (Exception e) {
LOG.warn("Failed to deserialize histogram statistics", e);
throw new CompletionException(e);
}
if (CollectionUtils.isEmpty(histogramStatistics)) {
statistic.setHistogram(Histogram.DEFAULT);
} else {
statistic.setHistogram(histogramStatistics.get(0));
}
return columnStatistics.get(0);
} finally {
synchronized (LOCK) {
CUR_RUNNING_LOAD--;
LOCK.notify();
}
}
return statistic;
});
}
}

View File

@ -74,8 +74,7 @@ public class StatisticsRepository {
private static final String INSERT_INTO_COLUMN_STATISTICS = "INSERT INTO "
+ FULL_QUALIFIED_COLUMN_STATISTICS_NAME + " VALUES('${id}', ${catalogId}, ${dbId}, ${tblId}, '${idxId}',"
+ "'${colId}', ${partId}, ${count}, ${ndv}, ${nullCount}, '${min}', '${max}', "
+ "'${histogram}', ${dataSize}, NOW())";
+ "'${colId}', ${partId}, ${count}, ${ndv}, ${nullCount}, '${min}', '${max}', ${dataSize}, NOW())";
public static ColumnStatistic queryColumnStatisticsByName(long tableId, String colName) {
ResultRow resultRow = queryColumnStatisticById(tableId, colName);
@ -159,7 +158,6 @@ public class StatisticsRepository {
String nullCount = alterColumnStatsStmt.getValue(StatsType.NUM_NULLS);
String min = alterColumnStatsStmt.getValue(StatsType.MIN_VALUE);
String max = alterColumnStatsStmt.getValue(StatsType.MAX_VALUE);
String histogram = alterColumnStatsStmt.getValue(StatsType.HISTOGRAM);
String dataSize = alterColumnStatsStmt.getValue(StatsType.DATA_SIZE);
ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
String colName = alterColumnStatsStmt.getColumnName();
@ -181,12 +179,10 @@ public class StatisticsRepository {
builder.setMaxExpr(StatisticsUtil.readableValue(column.getType(), max));
builder.setMaxValue(StatisticsUtil.convertToDouble(column.getType(), max));
}
if (histogram != null) {
builder.setHistogram(Histogram.deserializeFromJson(column.getType(), histogram));
}
if (dataSize != null) {
builder.setDataSize(Double.parseDouble(dataSize));
}
ColumnStatistic columnStatistic = builder.build();
Map<String, String> params = new HashMap<>();
params.put("id", constructId(objects.table.getId(), -1, colName));
@ -201,9 +197,17 @@ public class StatisticsRepository {
params.put("nullCount", String.valueOf(columnStatistic.numNulls));
params.put("min", min == null ? "NULL" : min);
params.put("max", max == null ? "NULL" : max);
params.put("histogram", (columnStatistic.histogram == null) ? "NULL" : histogram);
params.put("dataSize", String.valueOf(columnStatistic.dataSize));
StatisticsUtil.execUpdate(INSERT_INTO_COLUMN_STATISTICS, params);
Env.getCurrentEnv().getStatisticsCache().updateCache(objects.table.getId(), -1, colName, builder.build());
Histogram histogram = Env.getCurrentEnv().getStatisticsCache()
.getHistogram(objects.table.getId(), -1, colName);
Statistic statistic = new Statistic();
statistic.setHistogram(histogram);
statistic.setColumnStatistic(builder.build());
Env.getCurrentEnv().getStatisticsCache()
.updateCache(objects.table.getId(), -1, colName, statistic);
}
}

View File

@ -29,8 +29,10 @@ import org.apache.doris.analysis.StatementBase;
import org.apache.doris.analysis.StringLiteral;
import org.apache.doris.analysis.TableName;
import org.apache.doris.analysis.UserIdentity;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.TableIf;
@ -47,6 +49,7 @@ import org.apache.doris.qe.SessionVariable;
import org.apache.doris.qe.StmtExecutor;
import org.apache.doris.statistics.AnalysisTaskInfo;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.Histogram;
import org.apache.doris.statistics.StatisticConstants;
import org.apache.doris.statistics.util.InternalQueryResult.ResultRow;
import org.apache.doris.system.SystemInfoService;
@ -107,6 +110,11 @@ public class StatisticsUtil {
return resultBatches.stream().map(ColumnStatistic::fromResultRow).collect(Collectors.toList());
}
public static List<Histogram> deserializeToHistogramStatistics(List<ResultRow> resultBatches)
throws Exception {
return resultBatches.stream().map(Histogram::fromResultRow).collect(Collectors.toList());
}
public static AutoCloseConnectContext buildConnectContext() {
ConnectContext connectContext = new ConnectContext();
SessionVariable sessionVariable = connectContext.getSessionVariable();
@ -253,4 +261,26 @@ public class StatisticsUtil {
}
return new DBObjects(catalogIf, databaseIf, tableIf);
}
public static Column findColumn(long catalogId, long dbId, long tblId, long idxId, String columnName) {
CatalogIf<DatabaseIf<TableIf>> catalogIf = Env.getCurrentEnv().getCatalogMgr().getCatalog(catalogId);
if (catalogIf == null) {
return null;
}
DatabaseIf<TableIf> db = catalogIf.getDb(dbId).orElse(null);
if (db == null) {
return null;
}
TableIf tblIf = db.getTable(tblId).orElse(null);
if (tblIf == null) {
return null;
}
if (idxId != -1) {
if (tblIf instanceof OlapTable) {
OlapTable olapTable = (OlapTable) tblIf;
return olapTable.getIndexIdToMeta().get(idxId).getColumnByName(columnName);
}
}
return tblIf.getColumn(columnName);
}
}

View File

@ -478,6 +478,7 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("write", new Integer(SqlParserSymbols.KW_WRITE));
keywordMap.put("year", new Integer(SqlParserSymbols.KW_YEAR));
keywordMap.put("mtmv", new Integer(SqlParserSymbols.KW_MTMV));
keywordMap.put("histogram", new Integer(SqlParserSymbols.KW_HISTOGRAM));
}
// map from token id to token description

View File

@ -191,7 +191,7 @@ public class HyperGraphBuilder {
int count = rowCounts.get(Integer.parseInt(scanPlan.getTable().getName()));
for (Slot slot : scanPlan.getOutput()) {
slotIdToColumnStats.put(slot.getExprId(),
new ColumnStatistic(count, count, 0, 0, 0, 0, 0, null, 0, null, null, true));
new ColumnStatistic(count, count, 0, 0, 0, 0, 0, 0, null, null, true));
}
StatsDeriveResult stats = new StatsDeriveResult(count, slotIdToColumnStats);
group.setStatistics(stats);

View File

@ -82,7 +82,7 @@ public class AnalysisJobTest extends TestWithFeService {
return connectContext;
}
};
String sql = "ANALYZE t1";
String sql = "ANALYZE TABLE t1";
Assertions.assertNotNull(getSqlStmtExecutor(sql));
}

View File

@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
@ -63,14 +64,12 @@ public class CacheTest extends TestWithFeService {
@Test
public void testLoad() throws Exception {
new MockUp<ColumnStatistic>() {
new MockUp<StatisticsUtil>() {
@Mock
public Column findColumn(long catalogId, long dbId, long tblId, long idxId, String columnName) {
return new Column("abc", PrimitiveType.BIGINT);
}
};
new MockUp<StatisticsUtil>() {
@Mock
public List<ResultRow> execStatisticQuery(String sql) {
@ -91,7 +90,6 @@ public class CacheTest extends TestWithFeService {
colNames.add("col_id");
colNames.add("min");
colNames.add("max");
colNames.add("histogram");
List<PrimitiveType> primitiveTypes = new ArrayList<>();
primitiveTypes.add(PrimitiveType.BIGINT);
primitiveTypes.add(PrimitiveType.BIGINT);
@ -103,6 +101,7 @@ public class CacheTest extends TestWithFeService {
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
List<String> values = new ArrayList<>();
values.add("1");
values.add("2");
@ -115,7 +114,6 @@ public class CacheTest extends TestWithFeService {
values.add("8");
values.add("9");
values.add("10");
values.add("");
ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values);
return Arrays.asList(resultRow);
}
@ -123,10 +121,78 @@ public class CacheTest extends TestWithFeService {
StatisticsCache statisticsCache = new StatisticsCache();
ColumnStatistic columnStatistic = statisticsCache.getColumnStatistics(0, "col");
Assertions.assertEquals(ColumnStatistic.DEFAULT, columnStatistic);
Thread.sleep(100);
Thread.sleep(1000);
columnStatistic = statisticsCache.getColumnStatistics(0, "col");
Assertions.assertEquals(1, columnStatistic.count);
Assertions.assertEquals(2, columnStatistic.ndv);
Assertions.assertEquals(10, columnStatistic.maxValue);
}
@Test
public void testLoadHistogram() throws Exception {
new MockUp<StatisticsUtil>() {
@Mock
public Column findColumn(long catalogId, long dbId, long tblId, long idxId, String columnName) {
return new Column("abc", PrimitiveType.DATETIME);
}
@Mock
public List<ResultRow> execStatisticQuery(String sql) {
try {
Thread.sleep(50);
} catch (InterruptedException e) {
// ignore
}
List<String> colNames = new ArrayList<>();
colNames.add("catalog_id");
colNames.add("db_id");
colNames.add("idx_id");
colNames.add("tbl_id");
colNames.add("col_id");
colNames.add("sample_rate");
colNames.add("buckets");
List<PrimitiveType> primitiveTypes = new ArrayList<>();
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
primitiveTypes.add(PrimitiveType.VARCHAR);
List<String> values = new ArrayList<>();
values.add("1");
values.add("2");
values.add("3");
values.add("-1");
values.add("4");
values.add("0.2");
String buckets = "{\"max_bucket_num\":128,\"bucket_num\":5,\"sample_rate\":1.0,\"buckets\":"
+ "[{\"lower\":\"2022-09-21 17:30:29\",\"upper\":\"2022-09-21 22:30:29\","
+ "\"count\":9,\"pre_sum\":0,\"ndv\":1},"
+ "{\"lower\":\"2022-09-22 17:30:29\",\"upper\":\"2022-09-22 22:30:29\","
+ "\"count\":10,\"pre_sum\":9,\"ndv\":1},"
+ "{\"lower\":\"2022-09-23 17:30:29\",\"upper\":\"2022-09-23 22:30:29\","
+ "\"count\":9,\"pre_sum\":19,\"ndv\":1},"
+ "{\"lower\":\"2022-09-24 17:30:29\",\"upper\":\"2022-09-24 22:30:29\","
+ "\"count\":9,\"pre_sum\":28,\"ndv\":1},"
+ "{\"lower\":\"2022-09-25 17:30:29\",\"upper\":\"2022-09-25 22:30:29\","
+ "\"count\":9,\"pre_sum\":37,\"ndv\":1}]}";
values.add(buckets);
ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values);
return Collections.singletonList(resultRow);
}
};
StatisticsCache statisticsCache = new StatisticsCache();
Histogram histogram = statisticsCache.getHistogram(0, "col");
Assertions.assertEquals(Histogram.DEFAULT, histogram);
Thread.sleep(1000);
histogram = statisticsCache.getHistogram(0, "col");
Assertions.assertEquals("DATETIME", histogram.dataType.toString());
Assertions.assertEquals(128, histogram.maxBucketNum);
Assertions.assertEquals(5, histogram.bucketNum);
Assertions.assertEquals(0.2, histogram.sampleRate);
Assertions.assertEquals(5, histogram.buckets.size());
}
}

View File

@ -39,7 +39,7 @@ class HistogramTest {
@BeforeEach
void setUp() throws Exception {
String json = "{\"max_bucket_size\":128,\"bucket_size\":5,\"sample_rate\":1.0,\"buckets\":"
String json = "{\"max_bucket_num\":128,\"bucket_num\":5,\"sample_rate\":1.0,\"buckets\":"
+ "[{\"lower\":\"2022-09-21 17:30:29\",\"upper\":\"2022-09-21 22:30:29\","
+ "\"count\":9,\"pre_sum\":0,\"ndv\":1},"
+ "{\"lower\":\"2022-09-22 17:30:29\",\"upper\":\"2022-09-22 22:30:29\","
@ -58,19 +58,19 @@ class HistogramTest {
@Test
void testDeserializeFromJson() throws Exception {
Type dataType = histogramUnderTest.getDataType();
Type dataType = histogramUnderTest.dataType;
Assertions.assertTrue(dataType.isDatetime());
int maxBucketSize = histogramUnderTest.getMaxBucketSize();
int maxBucketSize = histogramUnderTest.maxBucketNum;
Assertions.assertEquals(128, maxBucketSize);
int bucketSize = histogramUnderTest.getBucketSize();
int bucketSize = histogramUnderTest.bucketNum;
Assertions.assertEquals(5, bucketSize);
float sampleRate = histogramUnderTest.getSampleRate();
double sampleRate = histogramUnderTest.sampleRate;
Assertions.assertEquals(1.0, sampleRate);
List<Bucket> buckets = histogramUnderTest.getBuckets();
List<Bucket> buckets = histogramUnderTest.buckets;
Assertions.assertEquals(5, buckets.size());
LiteralExpr expectedLower = LiteralExpr.create("2022-09-21 17:30:29",
@ -97,10 +97,10 @@ class HistogramTest {
String json = Histogram.serializeToJson(histogramUnderTest);
JSONObject histogramJson = JSON.parseObject(json);
int maxBucketSize = histogramJson.getIntValue("max_bucket_size");
int maxBucketSize = histogramJson.getIntValue("max_bucket_num");
Assertions.assertEquals(128, maxBucketSize);
int bucketSize = histogramJson.getIntValue("bucket_size");
int bucketSize = histogramJson.getIntValue("bucket_num");
Assertions.assertEquals(5, bucketSize);
float sampleRate = histogramJson.getFloat("sample_rate");

View File

@ -79,7 +79,7 @@ public class MVStatisticsTest extends TestWithFeService {
};
AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager();
Deencapsulation.setField(analysisManager, "statisticsCache", statisticsCache);
getSqlStmtExecutor("analyze t1");
getSqlStmtExecutor("analyze table t1");
Thread.sleep(3000);
}
}

View File

@ -27,7 +27,7 @@ public class StatsDeriveResultTest {
public void testUpdateRowCountByLimit() {
StatsDeriveResult stats = new StatsDeriveResult(100);
ColumnStatistic a = new ColumnStatistic(100, 10, 1, 5, 10,
1, 100, null, 0.5, null, null, false);
1, 100, 0.5, null, null, false);
Id id = new Id(1);
stats.addColumnStats(id, a);
StatsDeriveResult res = stats.updateByLimit(0);