From 26e81b65739814506e3d0b936c6f08718d65808e Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:00:56 +0800 Subject: [PATCH] [fix](stats)min and max return NaN when table is empty (#27862) fix analyze empty table and min/max null value bug: 1. Skip empty analyze task for sample analyze task. (Full analyze task already skipped). 2. Check sample rows is not 0 before calculate the scale factor. 3. Remove ' in sql template after remove base64 encoding for min/max value. --- .../doris/statistics/BaseAnalysisTask.java | 10 +++--- .../doris/statistics/HMSAnalysisTask.java | 11 ++++--- .../doris/statistics/JdbcAnalysisTask.java | 2 +- .../doris/statistics/OlapAnalysisTask.java | 32 ++++++++++--------- .../apache/doris/statistics/AnalyzeTest.java | 2 +- .../doris/statistics/HMSAnalysisTaskTest.java | 22 +++++++++++++ .../statistics/OlapAnalysisTaskTest.java | 9 ++---- 7 files changed, 56 insertions(+), 32 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 34d362161f..cdd5c3fc69 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask { + "${rowCount} AS `row_count`, " + "${ndvFunction} as `ndv`, " + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "'${min}' AS `min`, " - + "'${max}' AS `max`, " + + "${min} AS `min`, " + + "${max} AS `max`, " + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " @@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask { + "${row_count} AS `row_count`, " + "${ndv} AS `ndv`, " + "${null_count} AS `null_count`, " - + "'${min}' AS `min`, " - + "'${max}' AS `max`, " + + "${min} AS `min`, " + + "${max} AS `max`, " + "${data_size} AS `data_size`, " + "NOW() "; @@ -311,7 +311,7 @@ public abstract class BaseAnalysisTask { this.job = job; } - protected void runQuery(String sql, boolean needEncode) { + protected void runQuery(String sql) { long startTime = System.currentTimeMillis(); try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) { stmtExecutor = new StmtExecutor(a.connectContext, sql); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 7bd540de2c..efd99d1eca 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -159,7 +159,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { } stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); - runQuery(sql, true); + runQuery(sql); } // Collect the partition column stats through HMS metadata. @@ -201,12 +201,12 @@ public class HMSAnalysisTask extends BaseAnalysisTask { params.put("row_count", String.valueOf(count)); params.put("ndv", String.valueOf(ndv)); params.put("null_count", String.valueOf(numNulls)); - params.put("min", min); - params.put("max", max); + params.put("min", StatisticsUtil.quote(min)); + params.put("max", StatisticsUtil.quote(max)); params.put("data_size", String.valueOf(dataSize)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE); - runQuery(sql, true); + runQuery(sql); } private String updateMinValue(String currentMin, String value) { @@ -313,6 +313,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask { for (long size : chunkSizes) { total += size; } + if (total == 0) { + return Pair.of(1.0, 0L); + } // Calculate the sample target size for percent and rows sample. if (tableSample.isPercent()) { target = total * tableSample.getSampleValue() / 100; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index e2e83aa8fa..50c437fa8f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -110,7 +110,7 @@ public class JdbcAnalysisTask extends BaseAnalysisTask { params.put("dataSizeFunction", getDataSizeFunction(col, false)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); - runQuery(sql, true); + runQuery(sql); } private Map buildTableStatsParams(String partId) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 45ccdbb0d6..bf144b1a11 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -59,7 +59,13 @@ public class OlapAnalysisTask extends BaseAnalysisTask { } public void doExecute() throws Exception { - + Set partitionNames = info.colToPartitions.get(info.colName); + if (partitionNames.isEmpty()) { + LOG.debug("Skip empty empty partition task for column {} in {}.{}.{}", + info.catalogId, info.dbId, info.tblId, info.colName); + job.appendBuf(this, Collections.emptyList()); + return; + } if (tableSample != null) { doSample(); } else { @@ -113,24 +119,25 @@ public class OlapAnalysisTask extends BaseAnalysisTask { params.put("scaleFactor", String.valueOf(scaleFactor)); params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); params.put("ndvFunction", getNdvFunction(String.valueOf(rowCount))); - params.put("min", min); - params.put("max", max); + params.put("min", StatisticsUtil.quote(min)); + params.put("max", StatisticsUtil.quote(max)); params.put("rowCount", String.valueOf(rowCount)); params.put("type", col.getType().toString()); params.put("limit", ""); if (needLimit()) { // If the tablets to be sampled are too large, use limit to control the rows to read, and re-calculate // the scaleFactor. - limitFlag = true; rowsToSample = Math.min(getSampleRows(), pair.second); - params.put("limit", "limit " + rowsToSample); - params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample)); + // Empty table doesn't need to limit. + if (rowsToSample > 0) { + limitFlag = true; + params.put("limit", "limit " + rowsToSample); + params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample)); + } } StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql; if (useLinearAnalyzeTemplate()) { - params.put("min", StatisticsUtil.quote(min)); - params.put("max", StatisticsUtil.quote(max)); // For single unique key, use count as ndv. if (isSingleUniqueKey()) { params.put("ndvFunction", String.valueOf(rowCount)); @@ -148,7 +155,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask { col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"), limitFlag, tbl.isDistributionColumn(col.getName()), tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey()); - runQuery(sql, false); + runQuery(sql); } } @@ -169,11 +176,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask { */ protected void doFull() throws Exception { LOG.debug("Will do full collection for column {}", col.getName()); - Set partitionNames = info.colToPartitions.get(info.colName); - if (partitionNames.isEmpty()) { - job.appendBuf(this, Collections.emptyList()); - return; - } Map params = new HashMap<>(); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); @@ -189,7 +191,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask { params.put("tblName", String.valueOf(tbl.getName())); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String collectColStats = stringSubstitutor.replace(COLLECT_COL_STATISTICS); - runQuery(collectColStats, true); + runQuery(collectColStats); } // Get sample tablets id and scale up scaleFactor diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java index 74d52cf20f..268540885d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java @@ -158,7 +158,7 @@ public class AnalyzeTest extends TestWithFeService { new MockUp() { @Mock - protected void runQuery(String sql, boolean needEncode) {} + protected void runQuery(String sql) {} }; HashMap> colToPartitions = Maps.newHashMap(); colToPartitions.put("col1", Collections.singleton("t1")); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java index 24a74053bb..a569a5cb06 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java @@ -21,6 +21,7 @@ import org.apache.doris.analysis.TableSample; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.catalog.external.HMSExternalTable; +import org.apache.doris.common.Pair; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.collect.Lists; @@ -138,4 +139,25 @@ public class HMSAnalysisTaskTest { Assertions.assertEquals(1000, tableSample.getSampleValue()); } + @Test + public void testGetSampleInfo(@Mocked HMSExternalTable tableIf) + throws Exception { + new MockUp() { + @Mock + public List getChunkSizes() { + return Lists.newArrayList(); + } + }; + HMSAnalysisTask task = new HMSAnalysisTask(); + task.setTable(tableIf); + task.tableSample = null; + Pair info1 = task.getSampleInfo(); + Assertions.assertEquals(1.0, info1.first); + Assertions.assertEquals(0, info1.second); + task.tableSample = new TableSample(false, 100L); + Pair info2 = task.getSampleInfo(); + Assertions.assertEquals(1.0, info2.first); + Assertions.assertEquals(0, info2.second); + } + } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 7b7894e54b..ed9122f70b 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -149,8 +149,7 @@ public class OlapAnalysisTaskTest { } @Mock - public void runQuery(String sql, boolean needEncode) { - Assertions.assertFalse(needEncode); + public void runQuery(String sql) { Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); return; } @@ -216,8 +215,7 @@ public class OlapAnalysisTaskTest { } @Mock - public void runQuery(String sql, boolean needEncode) { - Assertions.assertFalse(needEncode); + public void runQuery(String sql) { Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); return; } @@ -290,8 +288,7 @@ public class OlapAnalysisTaskTest { } @Mock - public void runQuery(String sql, boolean needEncode) { - Assertions.assertFalse(needEncode); + public void runQuery(String sql) { Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); return; }