[improvement](statistics)Use count as ndv for unique/agg olap table single key column (#27186)
For a unique/agg OLAP table with a single key column, the count and the NDV of that column are the same value. For this kind of column there is no need to calculate NDV; simply use count as NDV.
This commit is contained in:
@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
|
||||
+ "${idxId} AS `idx_id`, "
|
||||
+ "'${colId}' AS `col_id`, "
|
||||
+ "NULL AS `part_id`, "
|
||||
+ "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
|
||||
+ "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, "
|
||||
+ "${rowCount} AS `row_count`, "
|
||||
+ "${ndvFunction} as `ndv`, "
|
||||
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
|
||||
+ "${min} AS `min`, "
|
||||
+ "${max} AS `max`, "
|
||||
|
||||
@ -145,6 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
|
||||
bucketFlag = true;
|
||||
sb.append(LINEAR_ANALYZE_TEMPLATE);
|
||||
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
|
||||
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
|
||||
} else {
|
||||
sb.append(DUJ1_ANALYZE_TEMPLATE);
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
package org.apache.doris.statistics;
|
||||
|
||||
import org.apache.doris.catalog.KeysType;
|
||||
import org.apache.doris.catalog.MaterializedIndex;
|
||||
import org.apache.doris.catalog.OlapTable;
|
||||
import org.apache.doris.catalog.Partition;
|
||||
@ -129,21 +130,26 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
}
|
||||
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
|
||||
String sql;
|
||||
// Single distribution column is not fit for DUJ1 estimator, use linear estimator.
|
||||
Set<String> distributionColumns = tbl.getDistributionColumnNames();
|
||||
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
|
||||
if (useLinearAnalyzeTemplate()) {
|
||||
params.put("min", StatisticsUtil.quote(min));
|
||||
params.put("max", StatisticsUtil.quote(max));
|
||||
// For single unique key, use count as ndv.
|
||||
if (isSingleUniqueKey()) {
|
||||
params.put("ndvFunction", String.valueOf(rowCount));
|
||||
} else {
|
||||
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
|
||||
}
|
||||
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
|
||||
} else {
|
||||
params.put("dataSizeFunction", getDataSizeFunction(col, true));
|
||||
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
|
||||
}
|
||||
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
|
||||
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}]",
|
||||
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
|
||||
+ "is single unique key [{}]",
|
||||
col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
|
||||
limitFlag, tbl.isDistributionColumn(col.getName()),
|
||||
tbl.isPartitionColumn(col.getName()), col.isKey());
|
||||
tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
|
||||
runQuery(sql, false);
|
||||
}
|
||||
}
|
||||
@ -278,4 +284,28 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
}
|
||||
return sampleRows;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the task should use linear analyze template.
|
||||
* @return True for single unique key column and single distribution column.
|
||||
*/
|
||||
protected boolean useLinearAnalyzeTemplate() {
|
||||
if (isSingleUniqueKey()) {
|
||||
return true;
|
||||
}
|
||||
Set<String> distributionColumns = tbl.getDistributionColumnNames();
|
||||
return distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase());
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the olap table has a single unique key.
|
||||
* @return True if the table has a single unique/agg key. False otherwise.
|
||||
*/
|
||||
protected boolean isSingleUniqueKey() {
|
||||
int keysNum = ((OlapTable) tbl).getKeysNum();
|
||||
KeysType keysType = ((OlapTable) tbl).getKeysType();
|
||||
return col.isKey()
|
||||
&& keysNum == 1
|
||||
&& (keysType.equals(KeysType.UNIQUE_KEYS) || keysType.equals(KeysType.AGG_KEYS));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user