[enhancement](stats) Unify sample algorithm between olap table and external table (#25472)

To reduce the error of NDV estimation for an OLAP table's columns
AKIRA committed 2023-10-18 14:18:40 +08:00 (committed by GitHub)
parent 0ec537edef · commit 8a8e6edba9
3 changed files with 27 additions and 23 deletions
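The change makes the OLAP sample path reuse the case-when estimator defined in BaseAnalysisTask below. As a minimal Java sketch of that rule with made-up sample numbers (the helper class, method, and values are illustrative only; in the commit the decision happens inside the generated SQL):

    public class NdvSampleRule {
        // Sketch of the estimator encoded by NDV_SAMPLE_TEMPLATE.
        static double estimateNdv(double sampledNdv, double sampledRowCount, double scaleFactor) {
            double threshold = 0.3; // NDV_MULTIPLY_THRESHOLD
            // Low distinct ratio: duplicates dominate, so the sample has likely
            // already seen most of the distinct values; keep the sampled NDV.
            if (sampledNdv / sampledRowCount < threshold) {
                return sampledNdv;
            }
            // High distinct ratio: the column is close to unique, so scale the
            // sampled NDV up by scaleFactor (total size / sample size).
            return sampledNdv * scaleFactor;
        }

        public static void main(String[] args) {
            // Hypothetical 1% sample: 10,000 rows sampled, scaleFactor = 100.
            System.out.println(estimateNdv(9_500, 10_000, 100)); // 950000.0 (scaled)
            System.out.println(estimateNdv(1_200, 10_000, 100)); // 1200.0 (kept as-is)
        }
    }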

BaseAnalysisTask.java

@@ -44,6 +44,15 @@ public abstract class BaseAnalysisTask {
public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class);
protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
protected static final String NDV_SAMPLE_TEMPLATE = "case when NDV(`${colName}`)/count('${colName}') < "
+ NDV_MULTIPLY_THRESHOLD
+ " then NDV(`${colName}`) "
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, ";
/**
* Stats stored in the column_statistics table basically has two types, `part_id` is null which means it is
* aggregate from partition level stats, `part_id` is not null which means it is partition level stats.
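Both call sites below splice NDV_SAMPLE_TEMPLATE into a larger INSERT ... SELECT after placeholder substitution. A self-contained sketch of that expansion, assuming org.apache.commons.text.StringSubstitutor as the template engine and made-up parameter values (the column name and scale factor here are hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.commons.text.StringSubstitutor;

    public class NdvTemplateDemo {
        // Same shape as NDV_SAMPLE_TEMPLATE above, with the threshold inlined.
        static final String NDV_SAMPLE_TEMPLATE =
                "case when NDV(`${colName}`)/count('${colName}') < 0.3 "
                + "then NDV(`${colName}`) "
                + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, ";

        public static void main(String[] args) {
            Map<String, String> params = new HashMap<>();
            params.put("colName", "user_id");   // hypothetical column
            params.put("scaleFactor", "100.0"); // total size / sample size
            // Prints the SQL fragment that lands in the INSERT ... SELECT:
            //   case when NDV(`user_id`)/count('user_id') < 0.3 then NDV(`user_id`)
            //   else NDV(`user_id`) * 100.0 end AS ndv,
            System.out.println(new StringSubstitutor(params).replace(NDV_SAMPLE_TEMPLATE));
        }
    }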

HMSAnalysisTask.java

@@ -50,7 +50,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
// While doing sample analysis, the sampled ndv result will be multiplied by a factor (total size / sample size)
// if ndv(col)/count(col) is greater than this threshold.
private static final String NDV_MULTIPLY_THRESHOLD = "0.3";
private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
@@ -62,11 +61,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ "case when NDV(`${colName}`)/count('${colName}') < "
+ NDV_MULTIPLY_THRESHOLD
+ " then NDV(`${colName}`) "
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
+ NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "

OlapAnalysisTask.java

@@ -54,7 +54,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
// NDV should only be computed for the relevant partition.
private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
+ " (SELECT NDV(`${colName}`) AS ndv "
+ " FROM `${dbName}`.`${tblName}`) t2\n";
+ " FROM `${dbName}`.`${tblName}`) t2";
private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE =
" SELECT "
@@ -73,22 +73,22 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
+ "${dataSizeFunction} AS data_size, "
+ "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName}";
private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n"
+ "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n"
+ "${catalogId} AS catalog_id, \n"
+ "${dbId} AS db_id, \n"
+ "${tblId} AS tbl_id, \n"
+ "${idxId} AS idx_id, \n"
+ "'${colId}' AS col_id, \n"
+ "NULL AS part_id, \n"
+ "COUNT(1) * ${scaleFactor} AS row_count, \n"
+ "NDV(`${colName}`) * ${scaleFactor} AS ndv, \n"
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, \n"
+ "MIN(`${colName}`) AS min, \n"
+ "MAX(`${colName}`) AS max, \n"
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, \n"
+ "NOW()\n"
+ "FROM `${dbName}`.`${tblName}`\n"
private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT "
+ "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+ "${catalogId} AS catalog_id, "
+ "${dbId} AS db_id, "
+ "${tblId} AS tbl_id, "
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "COUNT(1) * ${scaleFactor} AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${dbName}`.`${tblName}`"
+ "${tablets}";
// cache stats for each partition, it would be inserted into column_statistics in a batch.
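On the OLAP path, ${tablets} restricts the scan to a sampled subset of tablets and ${scaleFactor} compensates for the rows not scanned. A rough sketch of that relationship with hypothetical numbers (the names here are illustrative; the actual computation lives in OlapAnalysisTask):

    public class ScaleFactorSketch {
        // scaleFactor ~ total size / sample size, per the comment in HMSAnalysisTask.
        static double scaleFactor(long tableRowCount, long sampledRowCount) {
            return sampledRowCount == 0 ? 1.0 : (double) tableRowCount / sampledRowCount;
        }

        public static void main(String[] args) {
            // Hypothetical: 3 of 300 tablets sampled, roughly 10,000 rows per tablet.
            System.out.println(scaleFactor(3_000_000L, 30_000L)); // 100.0
        }
    }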