[opt](stats) Use sampling when aggregating column stats (#21020)

In the previous implementation, when partition statistics were aggregated into column statistics, the number of distinct values (NDV) for the entire column was computed without sampling, even when sampling had been requested. This full-table scan reduced the efficiency gained from sampling. With this PR, the column-level NDV query applies the same sample expression.

Before this PR, analyzing the table below (1,000,000 rows) took 5.75 sec; after this PR, it takes 3.39 sec.

```sql
CREATE TABLE IF NOT EXISTS `duplicate_all` (
    `k3` int(11) null comment "",
    `k0` boolean null comment "",
    `k1` tinyint(4) null comment "",
    `k2` smallint(6) null comment "",
    `k4` bigint(20) null comment "",
    `k5` decimalv3(9, 3) null comment "",
    `k6` char(36) null comment "",
    `k10` date null comment "",
    `k11` datetime null comment "",
    `k7` varchar(64) null comment "",
    `k8` double null comment "",
    `k9` float null comment "",
    `k12` string null comment "",
    `k13` largeint(40) null comment ""
) engine=olap
DUPLICATE KEY(`k3`)
DISTRIBUTED BY HASH(`k3`) BUCKETS 5
properties("replication_num" = "3")
```
2023-06-25 16:52:01 +09:00
parent dd99468b8f
commit cf66280e60
1 changed files with 1 additions and 2 deletions
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -48,7 +48,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
    //  NDV should only be computed for the relevant partition.
    private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
            + "     (SELECT NDV(`${colName}`) AS ndv "
-            + "     FROM `${dbName}`.`${tblName}`) t2\n";
+            + "     FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n";

    @VisibleForTesting
    public OlapAnalysisTask() {
@@ -93,7 +93,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
        }
        execSQLs(partitionAnalysisSQLs);
        params.remove("partId");
-        params.remove("sampleExpr");
        params.put("type", col.getType().toString());
        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
        String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE);