[opt](stats) Use sampling when aggregating column stats (#21020)
In the previous implementation, when partition statistics were aggregated into column statistics, the number of distinct values (NDV) for the entire column was computed over the full table instead of the sample, even when sampling was requested. This made sampled analysis less efficient than it should have been.
Before this PR, analyzing the table below, which has 1,000,000 rows, took 5.75 seconds; after this PR, it takes 3.39 seconds.
```sql
CREATE TABLE IF NOT EXISTS `duplicate_all` (
`k3` int(11) null comment "",
`k0` boolean null comment "",
`k1` tinyint(4) null comment "",
`k2` smallint(6) null comment "",
`k4` bigint(20) null comment "",
`k5` decimalv3(9, 3) null comment "",
`k6` char(36) null comment "",
`k10` date null comment "",
`k11` datetime null comment "",
`k7` varchar(64) null comment "",
`k8` double null comment "",
`k9` float null comment "",
`k12` string null comment "",
`k13` largeint(40) null comment ""
) engine=olap
DUPLICATE KEY(`k3`)
DISTRIBUTED BY HASH(`k3`) BUCKETS 5 properties("replication_num" = "3")
```
This commit is contained in:
@ -48,7 +48,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
// NDV should only be computed for the relevant partition.
|
||||
private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
|
||||
+ " (SELECT NDV(`${colName}`) AS ndv "
|
||||
+ " FROM `${dbName}`.`${tblName}`) t2\n";
|
||||
+ " FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n";
|
||||
|
||||
@VisibleForTesting
|
||||
public OlapAnalysisTask() {
|
||||
@ -93,7 +93,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
}
|
||||
execSQLs(partitionAnalysisSQLs);
|
||||
params.remove("partId");
|
||||
params.remove("sampleExpr");
|
||||
params.put("type", col.getType().toString());
|
||||
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
|
||||
String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE);
|
||||
|
||||
Reference in New Issue
Block a user