[enhancement](stats) Unify sample algorithm between olap table and external table (#25472)
To reduce the error in NDV estimation for OLAP table columns.
This commit is contained in:
@ -44,6 +44,15 @@ public abstract class BaseAnalysisTask {
|
||||
|
||||
public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class);
|
||||
|
||||
protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
|
||||
|
||||
protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ "case when NDV(`${colName}`)/count('${colName}') < "
|
||||
+ NDV_MULTIPLY_THRESHOLD
|
||||
+ " then NDV(`${colName}`) "
|
||||
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
|
||||
;
|
||||
|
||||
/**
|
||||
* Stats stored in the column_statistics table basically have two types: if `part_id` is null, the row is
|
||||
* aggregated from partition level stats; if `part_id` is not null, the row is partition level stats.
|
||||
|
||||
@ -50,7 +50,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
|
||||
// While doing sample analysis, the sampled ndv result will be multiplied by a factor (total size/sample size)
|
||||
// if ndv(col)/count(col) is greater than this threshold.
|
||||
private static final String NDV_MULTIPLY_THRESHOLD = "0.3";
|
||||
|
||||
private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO "
|
||||
+ "${internalDB}.${columnStatTbl}"
|
||||
@ -62,11 +61,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
+ "${idxId} AS idx_id, "
|
||||
+ "'${colId}' AS col_id, "
|
||||
+ "NULL AS part_id, "
|
||||
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ "case when NDV(`${colName}`)/count('${colName}') < "
|
||||
+ NDV_MULTIPLY_THRESHOLD
|
||||
+ " then NDV(`${colName}`) "
|
||||
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
|
||||
+ NDV_SAMPLE_TEMPLATE
|
||||
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
|
||||
@ -54,7 +54,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
// NDV should only be computed for the relevant partition.
|
||||
private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
|
||||
+ " (SELECT NDV(`${colName}`) AS ndv "
|
||||
+ " FROM `${dbName}`.`${tblName}`) t2\n";
|
||||
+ " FROM `${dbName}`.`${tblName}`) t2";
|
||||
|
||||
private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE =
|
||||
" SELECT "
|
||||
@ -73,22 +73,22 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
+ "${dataSizeFunction} AS data_size, "
|
||||
+ "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName}";
|
||||
|
||||
private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n"
|
||||
+ "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n"
|
||||
+ "${catalogId} AS catalog_id, \n"
|
||||
+ "${dbId} AS db_id, \n"
|
||||
+ "${tblId} AS tbl_id, \n"
|
||||
+ "${idxId} AS idx_id, \n"
|
||||
+ "'${colId}' AS col_id, \n"
|
||||
+ "NULL AS part_id, \n"
|
||||
+ "COUNT(1) * ${scaleFactor} AS row_count, \n"
|
||||
+ "NDV(`${colName}`) * ${scaleFactor} AS ndv, \n"
|
||||
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, \n"
|
||||
+ "MIN(`${colName}`) AS min, \n"
|
||||
+ "MAX(`${colName}`) AS max, \n"
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, \n"
|
||||
+ "NOW()\n"
|
||||
+ "FROM `${dbName}`.`${tblName}`\n"
|
||||
private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT "
|
||||
+ "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
|
||||
+ "${catalogId} AS catalog_id, "
|
||||
+ "${dbId} AS db_id, "
|
||||
+ "${tblId} AS tbl_id, "
|
||||
+ "${idxId} AS idx_id, "
|
||||
+ "'${colId}' AS col_id, "
|
||||
+ "NULL AS part_id, "
|
||||
+ "COUNT(1) * ${scaleFactor} AS row_count, "
|
||||
+ NDV_SAMPLE_TEMPLATE
|
||||
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
|
||||
+ "NOW() "
|
||||
+ "FROM `${dbName}`.`${tblName}`"
|
||||
+ "${tablets}";
|
||||
|
||||
// Cache stats for each partition; they will be inserted into column_statistics in a batch.
|
||||
|
||||
Reference in New Issue
Block a user