[opt](stats) Use sampling when aggregating column stats (#21020)

Previously, when partition statistics were aggregated into column statistics, the number of distinct values (NDV) for the entire column was computed without sampling, which reduced the efficiency gained from sampling. This PR applies the sample expression to the column-level NDV query as well.

Before this PR, analyzing the table below (1,000,000 rows) took 5.75 seconds; after this PR, it takes 3.39 seconds.


```sql
CREATE TABLE IF NOT EXISTS `duplicate_all` (
    `k3` int(11) null comment "",
    `k0` boolean null comment "",
    `k1` tinyint(4) null comment "",
    `k2` smallint(6) null comment "",
    `k4` bigint(20) null comment "",
    `k5` decimalv3(9, 3) null comment "",
    `k6` char(36) null comment "",
    `k10` date null comment "",
    `k11` datetime null comment "",
    `k7` varchar(64) null comment "",
    `k8` double null comment "",
    `k9` float null comment "",
    `k12` string  null comment "",
    `k13` largeint(40)  null comment ""
) engine=olap
DUPLICATE KEY(`k3`)
DISTRIBUTED BY HASH(`k3`) BUCKETS 5 properties("replication_num" = "3")
```
This commit is contained in:
AKIRA
2023-06-25 16:52:01 +09:00
committed by GitHub
parent dd99468b8f
commit cf66280e60

View File

@ -48,7 +48,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
// NDV should only be computed for the relevant partition.
private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
+ " (SELECT NDV(`${colName}`) AS ndv "
+ " FROM `${dbName}`.`${tblName}`) t2\n";
+ " FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n";
@VisibleForTesting
public OlapAnalysisTask() {
@ -93,7 +93,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
execSQLs(partitionAnalysisSQLs);
params.remove("partId");
params.remove("sampleExpr");
params.put("type", col.getType().toString());
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE);