[enhancement](statistics) support sampling collection of statistics (#18880)

1. Support sampling when collecting statistics
2. Improve the syntax for collecting statistics
3. Support specifying the number of buckets when collecting histograms
4. Tweak some code structure

---

The statement supports both WITH clauses and PROPERTIES; the PROPERTIES form uses the same syntax as before. Usage examples are given after the notes below.

Column Statistics Collection Syntax:
```SQL
ANALYZE [ SYNC ] TABLE table_name
     [ (column_name [, ...]) ]
     [ [WITH SYNC] | [WITH INCREMENTAL] | [WITH SAMPLE PERCENT | ROWS ] ]
     [ PROPERTIES ('key' = 'value', ...) ];
```

Column histogram collection syntax:
```SQL
ANALYZE [ SYNC ] TABLE table_name
     [ (column_name [, ...]) ]
     UPDATE HISTOGRAM
     [ [ WITH SYNC ][ WITH INCREMENTAL ][ WITH SAMPLE PERCENT | ROWS ][ WITH BUCKETS ] ]
     [ PROPERTIES ('key' = 'value', ...) ];
```

Notes:
- sync: Collect statistics synchronously; the statement returns after collection finishes.
- incremental: Collect statistics incrementally. Incremental collection of histogram statistics is not supported.
- sample percent | rows: Collect statistics by sampling, either by a percentage of the table or by a fixed number of rows.
- buckets: The maximum number of buckets generated when collecting histogram statistics.
- table_name: The target table for collecting statistics. Can be of the form `db_name.table_name`.
- column_name: The specified columns must exist in `table_name`; multiple column names are separated by commas.
- properties: Properties of the statistics task. Currently only the following keys are supported (equivalent to the WITH clauses):
   - 'sync' = 'true'
   - 'incremental' = 'true'
   - 'sample.percent' = '50'
   - 'sample.rows' = '1000'
   - 'num.buckets' = '10'
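
For illustration, a few statements that exercise the new options (the table and column names below are placeholders, not part of this change):
```SQL
-- Sample 10 percent of the rows and collect column statistics synchronously
ANALYZE SYNC TABLE example_tbl (col1, col2) WITH SAMPLE PERCENT 10;

-- Sample a fixed number of rows instead of a percentage
ANALYZE TABLE example_tbl (col1) WITH SAMPLE ROWS 100000;

-- Collect a histogram with at most 32 buckets over a 20 percent sample
ANALYZE TABLE example_tbl (col1) UPDATE HISTOGRAM WITH SAMPLE PERCENT 20 WITH BUCKETS 32;

-- Equivalent form using PROPERTIES instead of WITH clauses
ANALYZE TABLE example_tbl (col1) UPDATE HISTOGRAM
PROPERTIES ('sample.percent' = '20', 'num.buckets' = '32');
```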

--- 

TODO:
- Add the complete p0 tests
- For `Incremental` statistics, see #18653
Author: ElvinWei
Date: 2023-04-21 13:11:43 +08:00 (committed by GitHub)
Parent: ae76b59f2f
Commit: 1a6401d682
16 changed files with 575 additions and 177 deletions

View File

@ -540,6 +540,7 @@ terminal String
KW_ROW,
KW_ROWS,
KW_S3,
KW_SAMPLE,
KW_SCHEMA,
KW_SCHEMAS,
KW_SECOND,
@ -890,6 +891,10 @@ nonterminal PauseSyncJobStmt pause_sync_job_stmt;
nonterminal StopSyncJobStmt stop_sync_job_stmt;
nonterminal JobName job_name;
// analyze
nonterminal Map<String, String> with_analysis_properties;
nonterminal List<Map<String, String>> opt_with_analysis_properties;
nonterminal String opt_db, procedure_or_function, opt_comment, opt_engine;
nonterminal ColumnDef.DefaultValue opt_default_value;
nonterminal Boolean opt_if_exists, opt_if_not_exists;
@ -2812,30 +2817,45 @@ show_create_reporitory_stmt ::=
// analyze statment
analyze_stmt ::=
KW_ANALYZE opt_sync:sync KW_TABLE table_name:tbl opt_col_list:cols opt_properties:properties
// statistics
KW_ANALYZE opt_sync:sync KW_TABLE table_name:tbl opt_col_list:cols
opt_with_analysis_properties:withAnalysisProperties opt_properties:properties
{:
boolean is_whole_tbl = (cols == null);
boolean is_histogram = false;
boolean is_increment = false;
RESULT = new AnalyzeStmt(tbl, sync, cols, properties, is_whole_tbl, is_histogram, is_increment);
if (properties == null) {
properties = Maps.newHashMap();
}
for (Map<String, String> property : withAnalysisProperties) {
properties.putAll(property);
}
if (!properties.containsKey("sync")) {
properties.put("sync", String.valueOf(sync));
}
// Rule: If no type is specified, see if there is a specified column
if (!properties.containsKey("analysis.type")) {
if ((cols == null)) {
properties.put("analysis.type", "INDEX");
} else {
properties.put("analysis.type", "COLUMN");
}
}
RESULT = new AnalyzeStmt(tbl, cols, properties);
:}
| KW_ANALYZE opt_sync:sync KW_INCREMENTAL KW_TABLE table_name:tbl opt_col_list:cols opt_partition_names:partitionNames opt_properties:properties
// histogram
| KW_ANALYZE opt_sync:sync KW_TABLE table_name:tbl opt_col_list:cols KW_UPDATE KW_HISTOGRAM
opt_with_analysis_properties:withAnalysisProperties opt_properties:properties
{:
boolean is_whole_tbl = (cols == null);
boolean is_histogram = false;
boolean is_increment = true;
RESULT = new AnalyzeStmt(tbl, sync, cols, properties, is_whole_tbl, is_histogram, is_increment);
:}
| KW_ANALYZE opt_sync:sync KW_TABLE table_name:tbl KW_UPDATE KW_HISTOGRAM KW_ON ident_list:cols opt_partition_names:partitionNames opt_properties:properties
{:
boolean is_whole_tbl = false;
boolean is_histogram = true;
boolean is_increment = false;
RESULT = new AnalyzeStmt(tbl, sync, cols, properties, is_whole_tbl, is_histogram, is_increment);
:}
| KW_ANALYZE opt_sync:sync KW_TABLE table_name:tbl KW_UPDATE KW_HISTOGRAM
{:
RESULT = new AnalyzeStmt(tbl, sync, null, new HashMap<>(), true, true, false);
if (properties == null) {
properties = Maps.newHashMap();
}
for (Map<String, String> property : withAnalysisProperties) {
properties.putAll(property);
}
if (!properties.containsKey("sync")) {
properties.put("sync", String.valueOf(sync));
}
// TODO: Support materialized view
properties.put("analysis.type", "HISTOGRAM");
RESULT = new AnalyzeStmt(tbl, cols, properties);
:}
;
@ -5730,6 +5750,53 @@ ident_list ::=
:}
;
with_analysis_properties ::=
KW_SYNC
{:
RESULT = new HashMap<String, String>() {{
put("sync", "true");
}};
:}
| KW_INCREMENTAL
{:
RESULT = new HashMap<String, String>() {{
put("incremental", "true");
}};
:}
| KW_SAMPLE KW_PERCENT INTEGER_LITERAL:samplePercent
{:
RESULT = new HashMap<String, String>() {{
put("sample.percent", String.valueOf(samplePercent.intValue()));
}};
:}
| KW_SAMPLE KW_ROWS INTEGER_LITERAL:sampleRows
{:
RESULT = new HashMap<String, String>() {{
put("sample.rows", String.valueOf(sampleRows.intValue()));
}};
:}
| KW_BUCKETS INTEGER_LITERAL:numBuckets
{:
RESULT = new HashMap<String, String>() {{
put("num.buckets", String.valueOf(numBuckets.intValue()));
}};
:}
;
opt_with_analysis_properties ::=
/* empty */
{:
RESULT = Lists.newArrayList();
:}
| opt_with_analysis_properties:withAnalysisProperties
KW_WITH with_analysis_properties:property
{:
withAnalysisProperties.add(property);
RESULT = withAnalysisProperties;
:}
;
expr_list ::=
expr:e
{:
@ -7368,6 +7435,8 @@ keyword ::=
{: RESULT = id; :}
| KW_EXPIRED: id
{: RESULT = id; :}
| KW_SAMPLE:id
{: RESULT = id; :}
;
// Identifier that contain keyword

View File

@ -21,22 +21,22 @@ import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.catalog.View;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.FeNameFormat;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.PrintableMap;
import org.apache.doris.common.util.Util;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.mysql.privilege.PrivPredicate;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisMethod;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisType;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
@ -44,35 +44,57 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
* Collect statistics.
*
* syntax:
* ANALYZE [[ db_name.tb_name ] [( column_name [, ...] )], ...] [ PROPERTIES(...) ]
* db_name.tb_name: collect table and column statistics from tb_name
* column_name: collect column statistics from column_name
* properties: properties of statistics jobs
* Column Statistics Collection Syntax:
* ANALYZE [ SYNC ] TABLE table_name
* [ (column_name [, ...]) ]
* [ [WITH SYNC] | [WITH INCREMENTAL] | [WITH SAMPLE PERCENT | ROWS ] ]
* [ PROPERTIES ('key' = 'value', ...) ];
*
* Column histogram collection syntax:
* ANALYZE [ SYNC ] TABLE table_name
* [ (column_name [, ...]) ]
* UPDATE HISTOGRAM
* [ [ WITH SYNC ][ WITH INCREMENTAL ][ WITH SAMPLE PERCENT | ROWS ][ WITH BUCKETS ] ]
* [ PROPERTIES ('key' = 'value', ...) ];
*
* Notes:
* - sync: Collect statistics synchronously; the statement returns after collection finishes.
* - incremental: Collect statistics incrementally. Incremental collection of histogram statistics
*   is not supported.
* - sample percent | rows: Collect statistics by sampling, either by a percentage of the table
*   or by a fixed number of rows.
* - buckets: The maximum number of buckets generated when collecting histogram statistics.
* - table_name: The target table for collecting statistics. Can be of the form `db_name.table_name`.
* - column_name: The specified columns must exist in `table_name`; multiple column names are
*   separated by commas.
* - properties: Properties of the statistics task. Currently only the following keys are supported
*   (equivalent to the WITH clauses):
* - 'sync' = 'true'
* - 'incremental' = 'true'
* - 'sample.percent' = '50'
* - 'sample.rows' = '1000'
* - 'num.buckets' = '10'
*/
public class AnalyzeStmt extends DdlStmt {
// time to wait for collect statistics
public static final String CBO_STATISTICS_TASK_TIMEOUT_SEC = "cbo_statistics_task_timeout_sec";
// The properties passed in by the user through "with" or "properties('K', 'V')"
public static final String PROPERTY_SYNC = "sync";
public static final String PROPERTY_INCREMENTAL = "incremental";
public static final String PROPERTY_SAMPLE_PERCENT = "sample.percent";
public static final String PROPERTY_SAMPLE_ROWS = "sample.rows";
public static final String PROPERTY_NUM_BUCKETS = "num.buckets";
public static final String PROPERTY_ANALYSIS_TYPE = "analysis.type";
private static final ImmutableSet<String> PROPERTIES_SET = new ImmutableSet.Builder<String>()
.add(CBO_STATISTICS_TASK_TIMEOUT_SEC)
.add(PROPERTY_SYNC)
.add(PROPERTY_INCREMENTAL)
.add(PROPERTY_SAMPLE_PERCENT)
.add(PROPERTY_SAMPLE_ROWS)
.add(PROPERTY_NUM_BUCKETS)
.add(PROPERTY_ANALYSIS_TYPE)
.build();
private static final Predicate<Long> DESIRED_TASK_TIMEOUT_SEC = (v) -> v > 0L;
public boolean isWholeTbl;
public boolean isHistogram;
public boolean isIncrement;
private final TableName tableName;
private final boolean sync;
private final List<String> columnNames;
private final Map<String, String> properties;
@ -81,19 +103,11 @@ public class AnalyzeStmt extends DdlStmt {
private TableIf table;
public AnalyzeStmt(TableName tableName,
boolean sync,
List<String> columnNames,
Map<String, String> properties,
Boolean isWholeTbl,
Boolean isHistogram,
Boolean isIncrement) {
List<String> columnNames,
Map<String, String> properties) {
this.tableName = tableName;
this.sync = sync;
this.columnNames = columnNames;
this.properties = properties;
this.isWholeTbl = isWholeTbl;
this.isHistogram = isHistogram;
this.isIncrement = isIncrement;
}
@Override
@ -133,6 +147,15 @@ public class AnalyzeStmt extends DdlStmt {
}
checkProperties();
// TODO support external table
if (properties.containsKey(PROPERTY_SAMPLE_PERCENT)
|| properties.containsKey(PROPERTY_SAMPLE_ROWS)) {
if (!(table instanceof OlapTable)) {
throw new AnalysisException("Sampling statistics "
+ "collection of external tables is not supported");
}
}
}
@Override
@ -153,18 +176,88 @@ public class AnalyzeStmt extends DdlStmt {
}
private void checkProperties() throws UserException {
if (properties != null) {
Optional<String> optional = properties.keySet().stream().filter(
entity -> !PROPERTIES_SET.contains(entity)).findFirst();
if (optional.isPresent()) {
throw new AnalysisException(optional.get() + " is invalid property");
}
if (properties == null || properties.isEmpty()) {
throw new AnalysisException("analysis properties should not be empty");
}
long taskTimeout = ((Long) Util.getLongPropertyOrDefault(
properties.get(CBO_STATISTICS_TASK_TIMEOUT_SEC),
Config.max_cbo_statistics_task_timeout_sec, DESIRED_TASK_TIMEOUT_SEC,
CBO_STATISTICS_TASK_TIMEOUT_SEC + " should > 0")).intValue();
properties.put(CBO_STATISTICS_TASK_TIMEOUT_SEC, String.valueOf(taskTimeout));
String msgTemplate = "%s = %s is invalid property";
Optional<String> optional = properties.keySet().stream().filter(
entity -> !PROPERTIES_SET.contains(entity)).findFirst();
if (optional.isPresent()) {
String msg = String.format(msgTemplate, optional.get(), properties.get(optional.get()));
throw new AnalysisException(msg);
}
if (properties.containsKey(PROPERTY_SYNC)) {
try {
Boolean.valueOf(properties.get(PROPERTY_SYNC));
} catch (NumberFormatException e) {
String msg = String.format(msgTemplate, PROPERTY_SYNC, properties.get(PROPERTY_SYNC));
throw new AnalysisException(msg);
}
}
if (properties.containsKey(PROPERTY_INCREMENTAL)) {
try {
Boolean.valueOf(properties.get(PROPERTY_INCREMENTAL));
} catch (NumberFormatException e) {
String msg = String.format(msgTemplate, PROPERTY_INCREMENTAL, properties.get(PROPERTY_INCREMENTAL));
throw new AnalysisException(msg);
}
}
if (properties.containsKey(PROPERTY_SAMPLE_PERCENT)
&& properties.containsKey(PROPERTY_SAMPLE_ROWS)) {
throw new AnalysisException("only one sampling parameter can be specified simultaneously");
}
if (properties.containsKey(PROPERTY_SAMPLE_PERCENT)) {
checkNumericProperty(PROPERTY_SAMPLE_PERCENT, properties.get(PROPERTY_SAMPLE_PERCENT),
0, 100, false, "should be > 0 and < 100");
}
if (properties.containsKey(PROPERTY_SAMPLE_ROWS)) {
checkNumericProperty(PROPERTY_SAMPLE_ROWS, properties.get(PROPERTY_SAMPLE_ROWS),
0, Integer.MAX_VALUE, false, "needs at least 1 row");
}
if (properties.containsKey(PROPERTY_NUM_BUCKETS)) {
checkNumericProperty(PROPERTY_NUM_BUCKETS, properties.get(PROPERTY_NUM_BUCKETS),
1, Integer.MAX_VALUE, true, "needs at least 1 bucket");
}
if (properties.containsKey(PROPERTY_ANALYSIS_TYPE)) {
try {
AnalysisType.valueOf(properties.get(PROPERTY_ANALYSIS_TYPE));
} catch (NumberFormatException e) {
String msg = String.format(msgTemplate, PROPERTY_ANALYSIS_TYPE, properties.get(PROPERTY_ANALYSIS_TYPE));
throw new AnalysisException(msg);
}
}
if (properties.containsKey(PROPERTY_INCREMENTAL)
&& AnalysisType.valueOf(properties.get(PROPERTY_ANALYSIS_TYPE)) == AnalysisType.HISTOGRAM) {
throw new AnalysisException(PROPERTY_INCREMENTAL + " collection of histograms is not supported");
}
if (properties.containsKey(PROPERTY_NUM_BUCKETS)
&& AnalysisType.valueOf(properties.get(PROPERTY_ANALYSIS_TYPE)) != AnalysisType.HISTOGRAM) {
throw new AnalysisException(PROPERTY_NUM_BUCKETS + " can only be specified when collecting histograms");
}
}
private void checkNumericProperty(String key, String value, int lowerBound, int upperBound,
boolean includeBoundary, String errorMsg) throws AnalysisException {
if (!StringUtils.isNumeric(value)) {
String msg = String.format("%s = %s is an invalid property.", key, value);
throw new AnalysisException(msg);
}
int intValue = Integer.parseInt(value);
boolean isOutOfBounds = (includeBoundary && (intValue < lowerBound || intValue > upperBound))
|| (!includeBoundary && (intValue <= lowerBound || intValue >= upperBound));
if (isOutOfBounds) {
throw new AnalysisException(key + " " + errorMsg);
}
}
@ -198,8 +291,47 @@ public class AnalyzeStmt extends DdlStmt {
}
public Map<String, String> getProperties() {
// TODO add default properties
return properties != null ? properties : Maps.newHashMap();
return properties;
}
public boolean isSync() {
return Boolean.parseBoolean(properties.get(PROPERTY_SYNC));
}
public boolean isIncremental() {
return Boolean.parseBoolean(properties.get(PROPERTY_INCREMENTAL));
}
public int getSamplePercent() {
if (!properties.containsKey(PROPERTY_SAMPLE_PERCENT)) {
return 0;
}
return Integer.parseInt(properties.get(PROPERTY_SAMPLE_PERCENT));
}
public int getSampleRows() {
if (!properties.containsKey(PROPERTY_SAMPLE_ROWS)) {
return 0;
}
return Integer.parseInt(properties.get(PROPERTY_SAMPLE_ROWS));
}
public int getNumBuckets() {
if (!properties.containsKey(PROPERTY_NUM_BUCKETS)) {
return 0;
}
return Integer.parseInt(properties.get(PROPERTY_NUM_BUCKETS));
}
public AnalysisType getAnalysisType() {
return AnalysisType.valueOf(properties.get(PROPERTY_ANALYSIS_TYPE));
}
public AnalysisMethod getAnalysisMethod() {
if (getSamplePercent() > 0 || getSampleRows() > 0) {
return AnalysisMethod.SAMPLE;
}
return AnalysisMethod.FULL;
}
@Override
@ -207,27 +339,22 @@ public class AnalyzeStmt extends DdlStmt {
StringBuilder sb = new StringBuilder();
sb.append("ANALYZE");
if (isIncrement) {
sb.append(" ");
sb.append("INCREMENTAL");
}
if (tableName != null) {
sb.append(" ");
sb.append(tableName.toSql());
}
if (isHistogram) {
sb.append(" ");
sb.append("UPDATE HISTOGRAM ON");
sb.append(" ");
sb.append(StringUtils.join(columnNames, ","));
} else if (columnNames != null) {
if (columnNames != null) {
sb.append("(");
sb.append(StringUtils.join(columnNames, ","));
sb.append(")");
}
if (getAnalysisType().equals(AnalysisType.HISTOGRAM)) {
sb.append(" ");
sb.append("UPDATE HISTOGRAM");
}
if (properties != null) {
sb.append(" ");
sb.append("PROPERTIES(");
@ -239,8 +366,4 @@ public class AnalyzeStmt extends DdlStmt {
return sb.toString();
}
public boolean isSync() {
return sync;
}
}

View File

@ -90,19 +90,17 @@ public class AnalysisManager {
}
// Each analyze stmt corresponding to an analysis job.
public void createAnalysisJob(AnalyzeStmt analyzeStmt) throws DdlException {
String catalogName = analyzeStmt.getCatalogName();
String db = analyzeStmt.getDBName();
TableName tbl = analyzeStmt.getTblName();
StatisticsUtil.convertTableNameToObjects(tbl);
Set<String> colNames = analyzeStmt.getColumnNames();
Map<Long, AnalysisTaskInfo> analysisTaskInfos = new HashMap<>();
public void createAnalysisJob(AnalyzeStmt stmt) throws DdlException {
long jobId = Env.getCurrentEnv().getNextId();
createTaskForEachColumns(analyzeStmt, catalogName, db, tbl, colNames, analysisTaskInfos, jobId);
createTaskForMVIdx(analyzeStmt, catalogName, db, tbl, analysisTaskInfos, jobId);
persistAnalysisJob(catalogName, db, tbl, jobId);
AnalysisTaskInfoBuilder taskInfoBuilder = buildCommonTaskInfo(stmt, jobId);
Map<Long, AnalysisTaskInfo> analysisTaskInfos = new HashMap<>();
if (analyzeStmt.isSync()) {
// start build analysis tasks
createTaskForEachColumns(stmt.getColumnNames(), taskInfoBuilder, analysisTaskInfos);
createTaskForMVIdx(stmt.getTable(), taskInfoBuilder, analysisTaskInfos, stmt.getAnalysisType());
persistAnalysisJob(taskInfoBuilder);
if (stmt.isSync()) {
syncExecute(analysisTaskInfos.values());
return;
}
@ -111,42 +109,79 @@ public class AnalysisManager {
analysisTaskInfos.values().forEach(taskScheduler::schedule);
}
private void persistAnalysisJob(String catalogName, String db, TableName tbl,
long jobId) throws DdlException {
private AnalysisTaskInfoBuilder buildCommonTaskInfo(AnalyzeStmt stmt, long jobId) {
AnalysisTaskInfoBuilder taskInfoBuilder = new AnalysisTaskInfoBuilder();
String catalogName = stmt.getCatalogName();
String db = stmt.getDBName();
TableName tbl = stmt.getTblName();
StatisticsUtil.convertTableNameToObjects(tbl);
String tblName = tbl.getTbl();
int samplePercent = stmt.getSamplePercent();
int sampleRows = stmt.getSampleRows();
AnalysisType analysisType = stmt.getAnalysisType();
AnalysisMethod analysisMethod = stmt.getAnalysisMethod();
taskInfoBuilder.setJobId(jobId);
taskInfoBuilder.setCatalogName(catalogName);
taskInfoBuilder.setDbName(db);
taskInfoBuilder.setTblName(tblName);
taskInfoBuilder.setJobType(JobType.MANUAL);
taskInfoBuilder.setState(AnalysisState.PENDING);
taskInfoBuilder.setScheduleType(ScheduleType.ONCE);
if (analysisMethod == AnalysisMethod.SAMPLE) {
taskInfoBuilder.setAnalysisMethod(AnalysisMethod.SAMPLE);
taskInfoBuilder.setSamplePercent(samplePercent);
taskInfoBuilder.setSampleRows(sampleRows);
} else {
taskInfoBuilder.setAnalysisMethod(AnalysisMethod.FULL);
}
if (analysisType == AnalysisType.HISTOGRAM) {
taskInfoBuilder.setAnalysisType(AnalysisType.HISTOGRAM);
int numBuckets = stmt.getNumBuckets();
int maxBucketNum = numBuckets > 0 ? numBuckets
: StatisticConstants.HISTOGRAM_MAX_BUCKET_NUM;
taskInfoBuilder.setMaxBucketNum(maxBucketNum);
} else {
taskInfoBuilder.setAnalysisType(AnalysisType.COLUMN);
}
return taskInfoBuilder;
}
private void persistAnalysisJob(AnalysisTaskInfoBuilder taskInfoBuilder) throws DdlException {
try {
AnalysisTaskInfo analysisTaskInfo = new AnalysisTaskInfoBuilder().setJobId(
jobId).setTaskId(-1)
.setCatalogName(catalogName).setDbName(db)
.setTblName(tbl.getTbl())
.setJobType(JobType.MANUAL)
.setAnalysisMethod(AnalysisMethod.FULL).setAnalysisType(AnalysisType.INDEX)
.setScheduleType(ScheduleType.ONCE).build();
AnalysisTaskInfoBuilder jobInfoBuilder = taskInfoBuilder.copy();
AnalysisTaskInfo analysisTaskInfo = jobInfoBuilder.setTaskId(-1).build();
StatisticsRepository.persistAnalysisTask(analysisTaskInfo);
} catch (Throwable t) {
throw new DdlException(t.getMessage(), t);
}
}
private void createTaskForMVIdx(AnalyzeStmt analyzeStmt, String catalogName, String db, TableName tbl,
Map<Long, AnalysisTaskInfo> analysisTaskInfos, long jobId) throws DdlException {
if (!(analyzeStmt.isWholeTbl && analyzeStmt.getTable().getType().equals(TableType.OLAP))) {
private void createTaskForMVIdx(TableIf table, AnalysisTaskInfoBuilder taskInfoBuilder,
Map<Long, AnalysisTaskInfo> analysisTaskInfos, AnalysisType analysisType) throws DdlException {
TableType type = table.getType();
if (analysisType != AnalysisType.INDEX || !type.equals(TableType.OLAP)) {
// no need to collect statistics for materialized views
return;
}
OlapTable olapTable = (OlapTable) analyzeStmt.getTable();
taskInfoBuilder.setAnalysisType(analysisType);
OlapTable olapTable = (OlapTable) table;
try {
olapTable.readLock();
for (MaterializedIndexMeta meta : olapTable.getIndexIdToMeta().values()) {
if (meta.getDefineStmt() == null) {
continue;
}
AnalysisTaskInfoBuilder indexTaskInfoBuilder = taskInfoBuilder.copy();
long indexId = meta.getIndexId();
long taskId = Env.getCurrentEnv().getNextId();
AnalysisTaskInfo analysisTaskInfo = new AnalysisTaskInfoBuilder().setJobId(
jobId).setTaskId(taskId)
.setCatalogName(catalogName).setDbName(db)
.setTblName(tbl.getTbl())
.setIndexId(meta.getIndexId()).setJobType(JobType.MANUAL)
.setAnalysisMethod(AnalysisMethod.FULL).setAnalysisType(AnalysisType.INDEX)
.setScheduleType(ScheduleType.ONCE).build();
AnalysisTaskInfo analysisTaskInfo = indexTaskInfoBuilder.setIndexId(indexId)
.setTaskId(taskId).build();
try {
StatisticsRepository.persistAnalysisTask(analysisTaskInfo);
} catch (Exception e) {
@ -159,19 +194,14 @@ public class AnalysisManager {
}
}
private void createTaskForEachColumns(AnalyzeStmt analyzeStmt, String catalogName, String db, TableName tbl,
Set<String> colNames, Map<Long, AnalysisTaskInfo> analysisTaskInfos,
long jobId) throws DdlException {
private void createTaskForEachColumns(Set<String> colNames, AnalysisTaskInfoBuilder taskInfoBuilder,
Map<Long, AnalysisTaskInfo> analysisTaskInfos) throws DdlException {
for (String colName : colNames) {
AnalysisTaskInfoBuilder colTaskInfoBuilder = taskInfoBuilder.copy();
long indexId = -1;
long taskId = Env.getCurrentEnv().getNextId();
AnalysisType analType = analyzeStmt.isHistogram ? AnalysisType.HISTOGRAM : AnalysisType.COLUMN;
AnalysisTaskInfo analysisTaskInfo = new AnalysisTaskInfoBuilder().setJobId(jobId)
.setTaskId(taskId).setCatalogName(catalogName).setDbName(db)
.setTblName(tbl.getTbl()).setColName(colName)
.setJobType(JobType.MANUAL)
.setAnalysisMethod(AnalysisMethod.FULL).setAnalysisType(analType)
.setState(AnalysisState.PENDING)
.setScheduleType(ScheduleType.ONCE).build();
AnalysisTaskInfo analysisTaskInfo = colTaskInfoBuilder.setColName(colName)
.setIndexId(indexId).setTaskId(taskId).build();
try {
StatisticsRepository.persistAnalysisTask(analysisTaskInfo);
} catch (Exception e) {

View File

@ -70,9 +70,11 @@ public class AnalysisTaskInfo {
public final AnalysisType analysisType;
// TODO: define constants or get them from configuration properties
public final double sampleRate = 1.0;
public final int maxBucketNum = 128;
public final int samplePercent;
public final int sampleRows;
public final int maxBucketNum;
public String message;
@ -84,9 +86,9 @@ public class AnalysisTaskInfo {
public final ScheduleType scheduleType;
public AnalysisTaskInfo(long jobId, long taskId, String catalogName, String dbName, String tblName,
String colName, Long indexId, JobType jobType,
AnalysisMethod analysisMethod, AnalysisType analysisType, String message,
int lastExecTimeInMs, AnalysisState state, ScheduleType scheduleType) {
String colName, Long indexId, JobType jobType, AnalysisMethod analysisMethod,
AnalysisType analysisType, int samplePercent, int sampleRows, int maxBucketNum,
String message, int lastExecTimeInMs, AnalysisState state, ScheduleType scheduleType) {
this.jobId = jobId;
this.taskId = taskId;
this.catalogName = catalogName;
@ -97,6 +99,9 @@ public class AnalysisTaskInfo {
this.jobType = jobType;
this.analysisMethod = analysisMethod;
this.analysisType = analysisType;
this.samplePercent = samplePercent;
this.sampleRows = sampleRows;
this.maxBucketNum = maxBucketNum;
this.message = message;
this.lastExecTimeInMs = lastExecTimeInMs;
this.state = state;

View File

@ -33,6 +33,9 @@ public class AnalysisTaskInfoBuilder {
private JobType jobType;
private AnalysisMethod analysisMethod;
private AnalysisType analysisType;
private int maxBucketNum;
private int samplePercent;
private int sampleRows;
private String message;
private int lastExecTimeInMs;
private AnalysisState state;
@ -88,6 +91,21 @@ public class AnalysisTaskInfoBuilder {
return this;
}
public AnalysisTaskInfoBuilder setMaxBucketNum(int maxBucketNum) {
this.maxBucketNum = maxBucketNum;
return this;
}
public AnalysisTaskInfoBuilder setSamplePercent(int samplePercent) {
this.samplePercent = samplePercent;
return this;
}
public AnalysisTaskInfoBuilder setSampleRows(int sampleRows) {
this.sampleRows = sampleRows;
return this;
}
public AnalysisTaskInfoBuilder setMessage(String message) {
this.message = message;
return this;
@ -109,7 +127,29 @@ public class AnalysisTaskInfoBuilder {
}
public AnalysisTaskInfo build() {
return new AnalysisTaskInfo(jobId, taskId, catalogName, dbName, tblName, colName,
indexId, jobType, analysisMethod, analysisType, message, lastExecTimeInMs, state, scheduleType);
return new AnalysisTaskInfo(jobId, taskId, catalogName, dbName, tblName,
colName, indexId, jobType, analysisMethod, analysisType, samplePercent,
sampleRows, maxBucketNum, message, lastExecTimeInMs, state, scheduleType);
}
public AnalysisTaskInfoBuilder copy() {
return new AnalysisTaskInfoBuilder()
.setJobId(jobId)
.setTaskId(taskId)
.setCatalogName(catalogName)
.setDbName(dbName)
.setTblName(tblName)
.setColName(colName)
.setIndexId(indexId)
.setJobType(jobType)
.setAnalysisMethod(analysisMethod)
.setAnalysisType(analysisType)
.setSamplePercent(samplePercent)
.setSampleRows(sampleRows)
.setMaxBucketNum(maxBucketNum)
.setMessage(message)
.setLastExecTimeInMs(lastExecTimeInMs)
.setState(state)
.setScheduleType(scheduleType);
}
}

View File

@ -24,6 +24,7 @@ import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.qe.StmtExecutor;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisMethod;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisType;
import com.google.common.annotations.VisibleForTesting;
@ -189,4 +190,15 @@ public abstract class BaseAnalysisTask {
return unsupportedType.contains(type);
}
protected String getSampleExpression() {
if (info.analysisMethod == AnalysisMethod.FULL) {
return "";
}
// TODO Add sampling methods for external tables
if (info.samplePercent > 0) {
return String.format("TABLESAMPLE(%d PERCENT)", info.samplePercent);
} else {
return String.format("TABLESAMPLE(%d ROWS)", info.sampleRows);
}
}
}

View File

@ -19,8 +19,7 @@ package org.apache.doris.statistics;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.FeConstants;
import org.apache.doris.qe.AutoCloseConnectContext;
import org.apache.doris.qe.StmtExecutor;
import org.apache.doris.statistics.AnalysisTaskInfo.AnalysisMethod;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.annotations.VisibleForTesting;
@ -28,8 +27,6 @@ import org.apache.commons.text.StringSubstitutor;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Each task analyze one column.
@ -49,10 +46,7 @@ public class HistogramTask extends BaseAnalysisTask {
+ " HISTOGRAM(`${colName}`, ${maxBucketNum}) AS buckets, "
+ " NOW() AS create_time "
+ "FROM "
+ " `${dbName}`.`${tblName}`";
private static final String ANALYZE_HISTOGRAM_SQL_TEMPLATE_PART = ANALYZE_HISTOGRAM_SQL_TEMPLATE_TABLE
+ " PARTITION (${partName})";
+ " `${dbName}`.`${tblName}` ${sampleExpr}";
@VisibleForTesting
public HistogramTask() {
@ -71,44 +65,29 @@ public class HistogramTask extends BaseAnalysisTask {
params.put("catalogId", String.valueOf(catalog.getId()));
params.put("dbId", String.valueOf(db.getId()));
params.put("tblId", String.valueOf(tbl.getId()));
params.put("idxId", "-1");
params.put("idxId", String.valueOf(info.indexId));
params.put("colId", String.valueOf(info.colName));
params.put("dbName", info.dbName);
params.put("tblName", String.valueOf(info.tblName));
params.put("colName", String.valueOf(info.colName));
params.put("sampleRate", String.valueOf(info.sampleRate));
params.put("sampleRate", getSampleRateFunction());
params.put("sampleExpr", getSampleExpression());
params.put("maxBucketNum", String.valueOf(info.maxBucketNum));
params.put("percentValue", String.valueOf((int) (info.sampleRate * 100)));
String histogramSql;
Set<String> partitionNames = tbl.getPartitionNames();
if (partitionNames.isEmpty()) {
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
histogramSql = stringSubstitutor.replace(ANALYZE_HISTOGRAM_SQL_TEMPLATE_TABLE);
} else {
try {
tbl.readLock();
String partNames = partitionNames.stream()
.filter(x -> tbl.getPartition(x) != null)
.map(partName -> "`" + partName + "`")
.collect(Collectors.joining(","));
params.put("partName", partNames);
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
histogramSql = stringSubstitutor.replace(ANALYZE_HISTOGRAM_SQL_TEMPLATE_PART);
} finally {
tbl.readUnlock();
}
}
LOG.info("SQL to collect the histogram:\n {}", histogramSql);
try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) {
r.connectContext.getSessionVariable().disableNereidsPlannerOnce();
this.stmtExecutor = new StmtExecutor(r.connectContext, histogramSql);
this.stmtExecutor.execute();
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
StatisticsUtil.execUpdate(stringSubstitutor.replace(ANALYZE_HISTOGRAM_SQL_TEMPLATE_TABLE));
Env.getCurrentEnv().getStatisticsCache().refreshHistogramSync(tbl.getId(), -1, col.getName());
}
private String getSampleRateFunction() {
if (info.analysisMethod == AnalysisMethod.FULL) {
return "0";
}
if (info.samplePercent > 0) {
return String.valueOf(info.samplePercent / 100.0);
} else {
double sampRate = (double) info.sampleRows / tbl.getRowCount();
return sampRate >= 1 ? "1.0" : String.format("%.4f", sampRate);
}
}
}

View File

@ -50,11 +50,11 @@ import java.util.Map;
public class MVAnalysisTask extends BaseAnalysisTask {
private static final String ANALYZE_MV_PART = INSERT_PART_STATISTICS
+ " FROM (${sql}) mv";
+ " FROM (${sql}) mv ${sampleExpr}";
private static final String ANALYZE_MV_COL = INSERT_COL_STATISTICS
+ " (SELECT NDV(`${colName}`) AS ndv "
+ " FROM (${sql}) mv) t2\n";
+ " FROM (${sql}) mv) t2";
private MaterializedIndexMeta meta;
@ -118,9 +118,11 @@ public class MVAnalysisTask extends BaseAnalysisTask {
params.put("colName", colName);
params.put("tblName", String.valueOf(info.tblName));
params.put("sql", sql);
params.put("sampleExpr", getSampleExpression());
StatisticsUtil.execUpdate(ANALYZE_MV_PART, params);
}
params.remove("partId");
params.remove("sampleExpr");
params.put("type", column.getType().toString());
StatisticsUtil.execUpdate(ANALYZE_MV_COL, params);
Env.getCurrentEnv().getStatisticsCache()

View File

@ -40,7 +40,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
private static final String ANALYZE_PARTITION_SQL_TEMPLATE = INSERT_PART_STATISTICS
+ "FROM `${dbName}`.`${tblName}` "
+ "PARTITION ${partName}";
+ "PARTITION ${partName} ${sampleExpr}";
// TODO Currently, NDV is computed for the full table; in fact,
// NDV should only be computed for the relevant partition.
@ -64,12 +64,13 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("catalogId", String.valueOf(catalog.getId()));
params.put("dbId", String.valueOf(db.getId()));
params.put("tblId", String.valueOf(tbl.getId()));
params.put("idxId", "-1");
params.put("idxId", String.valueOf(info.indexId));
params.put("colId", String.valueOf(info.colName));
params.put("dataSizeFunction", getDataSizeFunction(col));
params.put("dbName", info.dbName);
params.put("colName", String.valueOf(info.colName));
params.put("tblName", String.valueOf(info.tblName));
params.put("sampleExpr", getSampleExpression());
List<String> partitionAnalysisSQLs = new ArrayList<>();
try {
tbl.readLock();
@ -90,6 +91,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
execSQLs(partitionAnalysisSQLs);
params.remove("partId");
params.remove("sampleExpr");
params.put("type", col.getType().toString());
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE);

View File

@ -68,4 +68,5 @@ public class StatisticConstants {
public static final int FETCH_LIMIT = 10000;
public static final int FETCH_INTERVAL_IN_MS = 500;
public static final int HISTOGRAM_MAX_BUCKET_NUM = 128;
}

View File

@ -418,6 +418,7 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("soname", new Integer(SqlParserSymbols.KW_SONAME));
keywordMap.put("split", new Integer(SqlParserSymbols.KW_SPLIT));
keywordMap.put("sql_block_rule", new Integer(SqlParserSymbols.KW_SQL_BLOCK_RULE));
keywordMap.put("sample", new Integer(SqlParserSymbols.KW_SAMPLE));
keywordMap.put("start", new Integer(SqlParserSymbols.KW_START));
keywordMap.put("stats", new Integer(SqlParserSymbols.KW_STATS));
keywordMap.put("status", new Integer(SqlParserSymbols.KW_STATUS));

View File

@ -75,7 +75,7 @@ public class HistogramTaskTest extends TestWithFeService {
AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager();
StmtExecutor executor = getSqlStmtExecutor(
"ANALYZE TABLE t1 UPDATE HISTOGRAM ON col1 PARTITION (p_201701)");
"ANALYZE TABLE t1(col1) UPDATE HISTOGRAM");
Assertions.assertNotNull(executor);
ConcurrentMap<Long, Map<Long, AnalysisTaskInfo>> taskMap =

View File

@ -0,0 +1,7 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
city 6.0 4.0 0.0 42.0 6.0 '上海' '深圳'
-- !sql --
city VARCHAR(20) 0.0 4 [{"lower_expr":"上海","upper_expr":"上海","count":1.0,"pre_sum":0.0,"ndv":1.0},{"lower_expr":"北京","upper_expr":"北京","count":2.0,"pre_sum":1.0,"ndv":1.0},{"lower_expr":"广州","upper_expr":"广州","count":1.0,"pre_sum":3.0,"ndv":1.0},{"lower_expr":"深圳","upper_expr":"深圳","count":2.0,"pre_sum":4.0,"ndv":1.0}]

View File

@ -34,7 +34,7 @@ suite("alter_column_stats") {
sql """INSERT INTO statistics_test VALUES(3, 'c', '2013-01-01')"""
sql """ANALYZE TABLE statistics_test"""
sql """ANALYZE TABLE statistics_test UPDATE HISTOGRAM ON col1,col2"""
sql """ANALYZE TABLE statistics_test(col1, col2) UPDATE HISTOGRAM"""
sleep(9000)

View File

@ -29,6 +29,10 @@ suite("analyze_test") {
def tblName3 = "${dbName3}.analyze_test_tbl_3"
def dbName4 = "analyze_test_db_4"
def tblName4 = "${dbName4}.analyze_test_tbl_4"
sql """
DROP DATABASE IF EXISTS ${dbName1};
@ -55,6 +59,13 @@ suite("analyze_test") {
CREATE DATABASE ${dbName3};
"""
sql """
DROP DATABASE IF EXISTS ${dbName4}
"""
sql """
CREATE DATABASE ${dbName4};
"""
sql """
DROP TABLE IF EXISTS ${tblName1}
@ -95,6 +106,18 @@ suite("analyze_test") {
"enable_unique_key_merge_on_write"="true"
);"""
sql """
DROP TABLE IF EXISTS ${tblName4}
"""
sql """CREATE TABLE ${tblName4} (analyze_test_col1 varchar(11451) not null, analyze_test_col2 int not null, analyze_test_col3 int not null)
UNIQUE KEY(analyze_test_col1)
DISTRIBUTED BY HASH(analyze_test_col1)
BUCKETS 3
PROPERTIES(
"replication_num"="1",
"enable_unique_key_merge_on_write"="true"
);"""
sql """insert into ${tblName1} values(1, 2, 3);"""
sql """insert into ${tblName1} values(4, 5, 6);"""
@ -114,6 +137,12 @@ suite("analyze_test") {
sql """insert into ${tblName3} values(3, 8, 2);"""
sql """insert into ${tblName3} values(5, 2, 1);"""
sql """insert into ${tblName4} values(1, 2, 3);"""
sql """insert into ${tblName4} values(4, 5, 6);"""
sql """insert into ${tblName4} values(7, 1, 9);"""
sql """insert into ${tblName4} values(3, 8, 2);"""
sql """insert into ${tblName4} values(5, 2, 1);"""
sql """
delete from __internal_schema.column_statistics where col_id in ('analyze_test_col1', 'analyze_test_col2', 'analyze_test_col3')
"""
@ -142,6 +171,22 @@ suite("analyze_test") {
ALTER TABLE ${tblName3} DROP COLUMN analyze_test_col2;
"""
sql """
analyze sync table ${tblName4} with sample rows 5;
"""
sql """
analyze table ${tblName4} with sync with incremental with sample percent 20;
"""
sql """
analyze sync table ${tblName4} update histogram with sample percent 20 with buckets 2;
"""
sql """
analyze table ${tblName4} update histogram with sync with sample rows 20 with buckets 2;
"""
sql """
DROP TABLE ${tblName2}
"""
@ -150,6 +195,10 @@ suite("analyze_test") {
DROP DATABASE ${dbName1}
"""
sql """
DROP DATABASE ${dbName4}
"""
sql """
DROP EXPIRED STATS
"""

View File

@ -0,0 +1,78 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_show_stats") {
def dbName = "stats_test"
def tblName = "${dbName}.example_tbl"
sql "DROP DATABASE IF EXISTS ${dbName}"
sql "CREATE DATABASE IF NOT EXISTS ${dbName};"
sql "DROP TABLE IF EXISTS ${tblName}"
sql """
CREATE TABLE IF NOT EXISTS ${tblName} (
`user_id` LARGEINT NOT NULL,
`date` DATE NOT NULL,
`city` VARCHAR(20),
`age` SMALLINT,
`sex` TINYINT,
`last_visit_date` DATETIME REPLACE,
`cost` BIGINT SUM,
`max_dwell_time` INT MAX,
`min_dwell_time` INT MIN
) ENGINE=OLAP
AGGREGATE KEY(`user_id`, `date`, `city`, `age`, `sex`)
PARTITION BY LIST(`date`)
(
PARTITION `p_201701` VALUES IN ("2017-10-01"),
PARTITION `p_201702` VALUES IN ("2017-10-02"),
PARTITION `p_201703` VALUES IN ("2017-10-03"),
PARTITION `default`
)
DISTRIBUTED BY HASH(`user_id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1"
);
"""
sql """
INSERT INTO ${tblName} (`user_id`, `date`, `city`, `age`,
`sex`, `last_visit_date`, `cost`,
`max_dwell_time`, `min_dwell_time`)
VALUES (10000, "2017-10-01", "北京", 20, 0, "2017-10-01 07:00:00", 15, 2, 2),
(10000, "2017-10-01", "北京", 20, 0, "2017-10-01 06:00:00", 20, 10, 10),
(10001, "2017-10-01", "北京", 30, 1, "2017-10-01 17:05:45", 2, 22, 22),
(10002, "2017-10-02", "上海", 20, 1, "2017-10-02 12:59:12", 200, 5, 5),
(10003, "2017-10-02", "广州", 32, 0, "2017-10-02 11:20:00", 30, 11, 11),
(10004, "2017-10-01", "深圳", 35, 0, "2017-10-01 10:00:15", 100, 3, 3),
(10004, "2017-10-03", "深圳", 35, 0, "2017-10-03 10:20:22", 11, 6, 6);
"""
sql "ANALYZE sync TABLE ${tblName};"
sql "ANALYZE sync TABLE ${tblName} UPDATE HISTOGRAM;"
qt_sql "SHOW COLUMN STATS ${tblName}(city);"
qt_sql "SHOW COLUMN HISTOGRAM ${tblName}(city);"
sql "DROP DATABASE IF EXISTS ${dbName}"
sql "CREATE DATABASE IF NOT EXISTS ${dbName};"
}