[improvement](statistics)Use count as ndv for unique/agg olap table single key column (#27186)

The single key column of a unique/agg OLAP table always has the same count and NDV.
For this kind of column there is no need to calculate NDV; simply use the count as the NDV.
This commit is contained in:
Jibing-Li
2023-11-20 15:49:08 +08:00
committed by GitHub
parent 6ed0be8e3c
commit d939903753
4 changed files with 39 additions and 8 deletions

View File

@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
+ "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "

View File

@ -145,6 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);

View File

@ -17,6 +17,7 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
@ -129,21 +130,26 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql;
// Single distribution column is not fit for DUJ1 estimator, use linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
if (useLinearAnalyzeTemplate()) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(rowCount));
} else {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
}
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}]",
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
+ "is single unique key [{}]",
col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
tbl.isPartitionColumn(col.getName()), col.isKey());
tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
runQuery(sql, false);
}
}
@ -278,4 +284,28 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
return sampleRows;
}
/**
 * Decide whether this task should build its SQL from the linear analyze template.
 *
 * @return True when the column is the table's single unique/agg key, or when the table
 *         has exactly one distribution column and it is this column (matched by
 *         lower-cased name). False otherwise.
 */
protected boolean useLinearAnalyzeTemplate() {
    if (!isSingleUniqueKey()) {
        Set<String> bucketColumns = tbl.getDistributionColumnNames();
        if (bucketColumns.size() != 1) {
            return false;
        }
        // Distribution column names are looked up by the lower-cased column name.
        String lowerCaseName = col.getName().toLowerCase();
        return bucketColumns.contains(lowerCaseName);
    }
    return true;
}
/**
 * Check whether the analyzed column is the only key column of a unique/agg olap table.
 *
 * @return True if the column is a key, the table has exactly one key column, and the
 *         table's keys type is UNIQUE_KEYS or AGG_KEYS. False otherwise.
 */
protected boolean isSingleUniqueKey() {
    OlapTable olapTable = (OlapTable) tbl;
    int keyColumnCount = olapTable.getKeysNum();
    KeysType tableKeysType = olapTable.getKeysType();
    if (!col.isKey() || keyColumnCount != 1) {
        return false;
    }
    return tableKeysType.equals(KeysType.UNIQUE_KEYS) || tableKeysType.equals(KeysType.AGG_KEYS);
}
}