[improvement](statistics)Set min max to NULL when collect stats with sample (#25593)

1. Min/max values computed from a sample are inaccurate and can mislead the optimizer, so set them to NULL when stats are collected with sampling.
2. Fix a mistake in NDV_SAMPLE_TEMPLATE: it should not contain the row-count expression.
This commit is contained in:
Jibing-Li
2023-10-19 18:00:55 +08:00
committed by GitHub
parent 68eaba7220
commit 4d2e7d7c86
6 changed files with 64 additions and 143 deletions

View File

@ -46,8 +46,7 @@ public abstract class BaseAnalysisTask {
protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ "case when NDV(`${colName}`)/count('${colName}') < "
protected static final String NDV_SAMPLE_TEMPLATE = "case when NDV(`${colName}`)/count('${colName}') < "
+ NDV_MULTIPLY_THRESHOLD
+ " then NDV(`${colName}`) "
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
@ -239,6 +238,24 @@ public abstract class BaseAnalysisTask {
return "COUNT(1) * " + column.getType().getSlotSize();
}
// Returns the SQL expression used for the column's min statistic.
// With no table sample configured, the real MIN aggregate is emitted; the
// ${colName} placeholder is returned as-is for the caller's template substitution.
// When a sample is in use, the observed minimum is not accurate, so a literal
// NULL is emitted instead to keep the optimizer from generating a bad plan.
protected String getMinFunction() {
    return tableSample == null ? "MIN(`${colName}`) " : "NULL ";
}
// Returns the SQL expression used for the column's max statistic.
// With no table sample configured, the real MAX aggregate is emitted; the
// ${colName} placeholder is returned as-is for the caller's template substitution.
// When a sample is in use, the observed maximum is not accurate, so a literal
// NULL is emitted instead to keep the optimizer from generating a bad plan.
protected String getMaxFunction() {
    return tableSample == null ? "MAX(`${colName}`) " : "NULL ";
}
protected TableSample getTableSample() {
if (info.forceFull) {
return null;

View File

@ -61,10 +61,11 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "${minFunction} AS min, "
+ "${maxFunction} AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
@ -177,6 +178,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
sb.append(ANALYZE_TABLE_TEMPLATE);
Map<String, String> params = buildStatsParams("NULL");
params.put("dataSizeFunction", getDataSizeFunction(col));
params.put("minFunction", getMinFunction());
params.put("maxFunction", getMaxFunction());
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
executeInsertSql(sql);

View File

@ -81,11 +81,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "COUNT(1) * ${scaleFactor} AS row_count, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "NULL AS min, "
+ "NULL AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${dbName}`.`${tblName}`"