[improvement](statistics)Set min max to NULL when collect stats with sample (#25593)

1. To avoid misleading of inaccurate min max stats, set the stats value to NULL while using sample to collect stats.
2. Fix NDV_SAMPLE_TEMPLATE typo, it shouldn't contain row count related contents.
This commit is contained in:
Jibing-Li
2023-10-19 18:00:55 +08:00
committed by GitHub
parent 68eaba7220
commit 4d2e7d7c86
6 changed files with 64 additions and 143 deletions

View File

@ -46,8 +46,7 @@ public abstract class BaseAnalysisTask {
protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ "case when NDV(`${colName}`)/count('${colName}') < "
protected static final String NDV_SAMPLE_TEMPLATE = "case when NDV(`${colName}`)/count('${colName}') < "
+ NDV_MULTIPLY_THRESHOLD
+ " then NDV(`${colName}`) "
+ "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
@ -239,6 +238,24 @@ public abstract class BaseAnalysisTask {
return "COUNT(1) * " + column.getType().getSlotSize();
}
// Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
protected String getMinFunction() {
if (tableSample == null) {
return "MIN(`${colName}`) ";
} else {
return "NULL ";
}
}
// Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
protected String getMaxFunction() {
if (tableSample == null) {
return "MAX(`${colName}`) ";
} else {
return "NULL ";
}
}
protected TableSample getTableSample() {
if (info.forceFull) {
return null;

View File

@ -61,10 +61,11 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "${minFunction} AS min, "
+ "${maxFunction} AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
@ -177,6 +178,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
sb.append(ANALYZE_TABLE_TEMPLATE);
Map<String, String> params = buildStatsParams("NULL");
params.put("dataSizeFunction", getDataSizeFunction(col));
params.put("minFunction", getMinFunction());
params.put("maxFunction", getMaxFunction());
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
executeInsertSql(sql);

View File

@ -81,11 +81,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
+ "COUNT(1) * ${scaleFactor} AS row_count, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "NULL AS min, "
+ "NULL AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${dbName}`.`${tblName}`"

View File

@ -1,99 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_hive_sample_statistic", "p2,external,hive,external_remote,external_remote_hive") {
String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
String catalog_name = "test_hive_sample_statistic"
sql """drop catalog if exists ${catalog_name};"""
sql """
create catalog if not exists ${catalog_name} properties (
'type'='hms',
'hadoop.username' = 'hadoop',
'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
);
"""
logger.info("catalog " + catalog_name + " created")
sql """use ${catalog_name}.tpch_1000_parquet"""
sql """analyze table part with sample percent 10 with sync;"""
def result = sql """show table stats part"""
assertTrue(result.size() == 1)
assertTrue(Long.parseLong(result[0][2]) >= 200000000)
assertTrue(Long.parseLong(result[0][2]) < 220000000)
def ctlId
result = sql """show proc '/catalogs'"""
for (int i = 0; i < result.size(); i++) {
if (result[i][1] == catalog_name) {
ctlId = result[i][0]
}
}
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_partkey'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_name'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_mfgr'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_brand'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_type'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_size'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_container'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_retailprice'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_comment'"""
assertTrue(result.size() == 1)
assertTrue(result[0][0] >= 200000000)
assertTrue(result[0][0] < 220000000)
sql """drop catalog ${catalog_name}""";
}
}

View File

@ -44,7 +44,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "46.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "1")
assertTrue(result[0][7] == "50")
@ -55,7 +55,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "26.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "1")
assertTrue(result[0][7] == "98")
@ -66,7 +66,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "7.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "1")
assertTrue(result[0][7] == "7")
@ -77,7 +77,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "26.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "67423")
assertTrue(result[0][7] == "2735521")
@ -88,7 +88,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "100.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "2250")
assertTrue(result[0][7] == "989601")
@ -99,7 +99,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "100.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "4167")
assertTrue(result[0][7] == "195845")
@ -110,7 +110,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "26.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "19920221")
assertTrue(result[0][7] == "19980721")
@ -121,7 +121,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "5.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "888.8000000000001")
assertTrue(result[0][4] == "880.0")
assertTrue(result[0][5] == "8.8")
assertTrue(result[0][6] == "'1-URGENT'")
assertTrue(result[0][7] == "'5-LOW'")
@ -132,7 +132,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "1.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "0")
assertTrue(result[0][7] == "0")
@ -143,7 +143,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "100.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "104300")
assertTrue(result[0][7] == "9066094")
@ -154,7 +154,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "26.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "3428256")
assertTrue(result[0][7] == "36771805")
@ -165,7 +165,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "11.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "0")
assertTrue(result[0][7] == "10")
@ -176,7 +176,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "100.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "101171")
assertTrue(result[0][7] == "8703450")
@ -187,7 +187,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "100.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "58023")
assertTrue(result[0][7] == "121374")
@ -198,7 +198,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "9.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "0")
assertTrue(result[0][7] == "8")
@ -209,7 +209,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "95.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "404.0")
assertTrue(result[0][4] == "400.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "19920515")
assertTrue(result[0][7] == "19981016")
@ -220,7 +220,7 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][1] == "100.0")
assertTrue(result[0][2] == "7.0")
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "425.21")
assertTrue(result[0][4] == "421.0")
assertTrue(result[0][5] == "4.21")
assertTrue(result[0][6] == "'AIR'")
assertTrue(result[0][7] == "'TRUCK'")

View File

@ -42,8 +42,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "20.0")
assertTrue(result[0][5] == "4.0")
assertTrue(result[0][6] == "0")
assertTrue(result[0][7] == "4")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats region (r_name)"""
assertTrue(result.size() == 1)
@ -53,8 +53,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "34.0")
assertTrue(result[0][5] == "6.8")
assertTrue(result[0][6] == "\'AFRICA\'")
assertTrue(result[0][7] == "\'MIDDLE EAST\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats region (r_comment)"""
assertTrue(result.size() == 1)
@ -64,8 +64,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "330.0")
assertTrue(result[0][5] == "66.0")
assertTrue(result[0][6] == "\'ges. thinly even pinto beans ca\'")
assertTrue(result[0][7] == "\'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_suppkey)"""
assertTrue(result.size() == 1)
@ -75,8 +75,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "3.9995194E7")
assertTrue(result[0][5] == "3.9999997999759773")
assertTrue(result[0][6] == "1885331")
assertTrue(result[0][7] == "9395153")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_name)"""
assertTrue(result.size() == 1)
@ -86,8 +86,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "1.79978374E8")
assertTrue(result[0][5] == "17.999999199903908")
assertTrue(result[0][6] == "\'Supplier#001885331\'")
assertTrue(result[0][7] == "\'Supplier#009395153\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_address)"""
assertTrue(result.size() == 1)
@ -97,8 +97,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "2.50070604E8")
assertTrue(result[0][5] == "25.010064108699456")
assertTrue(result[0][6] == "\' E,WAW2ZEx\'")
assertTrue(result[0][7] == "\'zzzw X3bpxu,OCpzgv6BdyMVMKzaB1DbH\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_nationkey)"""
assertTrue(result.size() == 1)
@ -108,8 +108,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "3.9995194E7")
assertTrue(result[0][5] == "3.9999997999759773")
assertTrue(result[0][6] == "0")
assertTrue(result[0][7] == "24")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_phone)"""
assertTrue(result.size() == 1)
@ -119,8 +119,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "1.49981978E8")
assertTrue(result[0][5] == "14.99999929991592")
assertTrue(result[0][6] == "\'10-100-128-4513\'")
assertTrue(result[0][7] == "\'34-999-967-7296\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_acctbal)"""
assertTrue(result.size() == 1)
@ -130,8 +130,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "7.9990388E7")
assertTrue(result[0][5] == "7.999999599951955")
assertTrue(result[0][6] == "-999.99")
assertTrue(result[0][7] == "9999.99")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
result = sql """show column stats supplier (s_comment)"""
assertTrue(result.size() == 1)
@ -141,8 +141,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "6.24883849E8")
assertTrue(result[0][5] == "62.49589065646784")
assertTrue(result[0][6] == "\' Customer across the pinto beans. pinRecommends\'")
assertTrue(result[0][7] == "\'zzle? express, regular foxes haggle final ac\'")
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
sql """drop catalog ${catalog_name}"""
}