[improvement](statistics)Set min max to NULL when collect stats with sample (#25593)
1. To avoid misleading the optimizer with inaccurate min/max stats, set the min and max values to NULL when stats are collected from a sample. 2. Fix NDV_SAMPLE_TEMPLATE: it should not contain the row-count expression.
This commit is contained in:
@ -46,8 +46,7 @@ public abstract class BaseAnalysisTask {
|
||||
|
||||
protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
|
||||
|
||||
protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ "case when NDV(`${colName}`)/count('${colName}') < "
|
||||
protected static final String NDV_SAMPLE_TEMPLATE = "case when NDV(`${colName}`)/count('${colName}') < "
|
||||
+ NDV_MULTIPLY_THRESHOLD
|
||||
+ " then NDV(`${colName}`) "
|
||||
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
|
||||
@ -239,6 +238,24 @@ public abstract class BaseAnalysisTask {
|
||||
return "COUNT(1) * " + column.getType().getSlotSize();
|
||||
}
|
||||
|
||||
// Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
|
||||
protected String getMinFunction() {
|
||||
if (tableSample == null) {
|
||||
return "MIN(`${colName}`) ";
|
||||
} else {
|
||||
return "NULL ";
|
||||
}
|
||||
}
|
||||
|
||||
// Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
|
||||
protected String getMaxFunction() {
|
||||
if (tableSample == null) {
|
||||
return "MAX(`${colName}`) ";
|
||||
} else {
|
||||
return "NULL ";
|
||||
}
|
||||
}
|
||||
|
||||
protected TableSample getTableSample() {
|
||||
if (info.forceFull) {
|
||||
return null;
|
||||
|
||||
@ -61,10 +61,11 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
+ "${idxId} AS idx_id, "
|
||||
+ "'${colId}' AS col_id, "
|
||||
+ "NULL AS part_id, "
|
||||
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ NDV_SAMPLE_TEMPLATE
|
||||
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
+ "${minFunction} AS min, "
|
||||
+ "${maxFunction} AS max, "
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
|
||||
+ "NOW() "
|
||||
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
|
||||
@ -177,6 +178,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
sb.append(ANALYZE_TABLE_TEMPLATE);
|
||||
Map<String, String> params = buildStatsParams("NULL");
|
||||
params.put("dataSizeFunction", getDataSizeFunction(col));
|
||||
params.put("minFunction", getMinFunction());
|
||||
params.put("maxFunction", getMaxFunction());
|
||||
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
|
||||
String sql = stringSubstitutor.replace(sb.toString());
|
||||
executeInsertSql(sql);
|
||||
|
||||
@ -81,11 +81,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
+ "${idxId} AS idx_id, "
|
||||
+ "'${colId}' AS col_id, "
|
||||
+ "NULL AS part_id, "
|
||||
+ "COUNT(1) * ${scaleFactor} AS row_count, "
|
||||
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ NDV_SAMPLE_TEMPLATE
|
||||
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
+ "NULL AS min, "
|
||||
+ "NULL AS max, "
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
|
||||
+ "NOW() "
|
||||
+ "FROM `${dbName}`.`${tblName}`"
|
||||
|
||||
Reference in New Issue
Block a user