From cf66280e6023909c370ac2798678264a015f200a Mon Sep 17 00:00:00 2001 From: AKIRA <33112463+Kikyou1997@users.noreply.github.com> Date: Sun, 25 Jun 2023 16:52:01 +0900 Subject: [PATCH] [opt](stats) Sampling when aggregate column stats (#21020) In the previous implementation, when aggregating partition statistics into column statistics, the number of distinct values (NDV) for the entire column was computed without sampling, even when sampling was requested, which made sampled analysis less efficient than it should be. Before this PR, analyzing the table below, which has 1000000 rows, took 5.75 sec; after this PR, it takes 3.39 sec. ```sql CREATE TABLE IF NOT EXISTS `duplicate_all` ( `k3` int(11) null comment "", `k0` boolean null comment "", `k1` tinyint(4) null comment "", `k2` smallint(6) null comment "", `k4` bigint(20) null comment "", `k5` decimalv3(9, 3) null comment "", `k6` char(36) null comment "", `k10` date null comment "", `k11` datetime null comment "", `k7` varchar(64) null comment "", `k8` double null comment "", `k9` float null comment "", `k12` string null comment "", `k13` largeint(40) null comment "" ) engine=olap DUPLICATE KEY(`k3`) DISTRIBUTED BY HASH(`k3`) BUCKETS 5 properties("replication_num" = "3") ``` --- .../java/org/apache/doris/statistics/OlapAnalysisTask.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index d33248e873..47df548e84 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -48,7 +48,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask { // NDV should only be computed for the relevant partition. 
private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS + " (SELECT NDV(`${colName}`) AS ndv " - + " FROM `${dbName}`.`${tblName}`) t2\n"; + + " FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n"; @VisibleForTesting public OlapAnalysisTask() { @@ -93,7 +93,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask { } execSQLs(partitionAnalysisSQLs); params.remove("partId"); - params.remove("sampleExpr"); params.put("type", col.getType().toString()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE);