From f4c5ce260b4ac4b3974bcd8ef7dcd059ecfdd78b Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Wed, 27 Dec 2023 23:04:37 +0800 Subject: [PATCH] [fix](statistics)Fix rowCount==0 while analyzing bug (#28969) Sample analysis needs to get the row count by calling table.getRowCount(). The value returned by this method is not updated in real time, which may cause the sample task to scan the whole table. This PR fixes that: set a flag that indicates the analyze job is for an empty table, and skip scanning the table. Meanwhile, don't reset updatedRows in this case. Set hugeTableAutoAnalyzeIntervalInMillis = 0 because the default huge table size has already been set to 0. --- docs/en/docs/query-acceleration/statistics.md | 4 ++-- docs/zh-CN/docs/query-acceleration/statistics.md | 4 ++-- .../main/java/org/apache/doris/qe/SessionVariable.java | 2 +- .../java/org/apache/doris/statistics/AnalysisInfo.java | 7 ++++++- .../apache/doris/statistics/AnalysisInfoBuilder.java | 10 ++++++++-- .../org/apache/doris/statistics/AnalysisManager.java | 1 + .../org/apache/doris/statistics/OlapAnalysisTask.java | 3 ++- .../apache/doris/statistics/StatisticConstants.java | 2 +- .../doris/statistics/StatisticsAutoCollector.java | 1 + .../org/apache/doris/statistics/TableStatsMeta.java | 2 +- .../doris/statistics/StatisticsAutoCollectorTest.java | 2 +- regression-test/suites/statistics/analyze_stats.groovy | 2 +- 12 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index c7a5827758..4cb0891172 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -295,8 +295,8 @@ mysql> KILL ANALYZE 52357; |auto_analyze_end_time|End time for automatic statistics collection|23:59:59| |enable_auto_analyze|Enable automatic collection functionality|true| |huge_table_default_sample_rows|Sampling rows for large tables|4194304| 
-|huge_table_lower_bound_size_in_bytes|Tables with size greater than this value will be automatically sampled during collection of statistics|5368709120| -|huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Tables with sizes greater than `huge_table_lower_bound_size_in_bytes * 5` will be ANALYZEed only once within this time interval.|43200000| +|huge_table_lower_bound_size_in_bytes|Tables with size greater than this value will be automatically sampled during collection of statistics|0| +|huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Tables with sizes greater than `huge_table_lower_bound_size_in_bytes * 5` will be analyzed only once within this time interval.|0| |table_stats_health_threshold|Ranges from 0 to 100. If data updates since the last statistics collection exceed `(100 - table_stats_health_threshold)%`, the table's statistics are considered outdated.|60| |analyze_timeout|Controls the timeout for synchronous ANALYZE in seconds|43200| |auto_analyze_table_width_threshold|Controls the maximum width of table that will be auto analyzed. 
Table with more columns than this value will not be auto analyzed.|70| diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index 20b535e357..bff100fa98 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -299,8 +299,8 @@ mysql> KILL ANALYZE 52357; |auto_analyze_end_time|自动统计信息收集结束时间|23:59:59| |enable_auto_analyze|开启自动收集功能|true| |huge_table_default_sample_rows|对大表的采样行数|4194304| -|huge_table_lower_bound_size_in_bytes|大小超过该值的的表,在自动收集时将会自动通过采样收集统计信息|5368709120| -|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes * 5的表仅ANALYZE一次|43200000| +|huge_table_lower_bound_size_in_bytes|大小超过该值的的表,在自动收集时将会自动通过采样收集统计信息|0| +|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes * 5的表仅ANALYZE一次|0| |table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|60| |analyze_timeout|控制ANALYZE超时时间,单位为秒|43200| |auto_analyze_table_width_threshold|控制自动统计信息收集处理的最大表宽度,列数大于该值的表不会参与自动统计信息收集|70| diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 3554505990..c1ea2f29ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -1450,7 +1450,7 @@ public class SessionVariable implements Serializable, Writable { "This controls the minimum time interval for automatic ANALYZE on large tables." 
+ "Within this interval," + "tables larger than huge_table_lower_bound_size_in_bytes are analyzed only once."}) - public long hugeTableAutoAnalyzeIntervalInMillis = TimeUnit.HOURS.toMillis(12); + public long hugeTableAutoAnalyzeIntervalInMillis = TimeUnit.HOURS.toMillis(0); @VariableMgr.VarAttr(name = EXTERNAL_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS, flag = VariableMgr.GLOBAL, description = {"控制对外表的自动ANALYZE的最小时间间隔,在该时间间隔内的外表仅ANALYZE一次", diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index 65bb4a5dd9..aaff9e5992 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -188,6 +188,9 @@ public class AnalysisInfo implements Writable { @SerializedName("endTime") public long endTime; + + @SerializedName("emptyJob") + public final boolean emptyJob; /** * * Used to store the newest partition version of tbl when creating this job. 
@@ -202,7 +205,7 @@ public class AnalysisInfo implements Writable { long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull, - boolean usingSqlForPartitionColumn, long tblUpdateTime) { + boolean usingSqlForPartitionColumn, long tblUpdateTime, boolean emptyJob) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -238,6 +241,7 @@ public class AnalysisInfo implements Writable { this.forceFull = forceFull; this.usingSqlForPartitionColumn = usingSqlForPartitionColumn; this.tblUpdateTime = tblUpdateTime; + this.emptyJob = emptyJob; } @Override @@ -279,6 +283,7 @@ public class AnalysisInfo implements Writable { } sj.add("forceFull: " + forceFull); sj.add("usingSqlForPartitionColumn: " + usingSqlForPartitionColumn); + sj.add("emptyJob: " + emptyJob); return sj.toString(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index 204aba6d0f..310b7816ec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -61,8 +61,8 @@ public class AnalysisInfoBuilder { private CronExpression cronExpression; private boolean forceFull; private boolean usingSqlForPartitionColumn; - private long tblUpdateTime; + private boolean emptyJob; public AnalysisInfoBuilder() { } @@ -100,6 +100,7 @@ public class AnalysisInfoBuilder { forceFull = info.forceFull; usingSqlForPartitionColumn = info.usingSqlForPartitionColumn; tblUpdateTime = info.tblUpdateTime; + emptyJob = info.emptyJob; } public AnalysisInfoBuilder setJobId(long jobId) { @@ -262,12 +263,17 @@ public class AnalysisInfoBuilder { return this; } + public AnalysisInfoBuilder 
setEmptyJob(boolean emptyJob) { + this.emptyJob = emptyJob; + return this; + } + public AnalysisInfo build() { return new AnalysisInfo(jobId, taskId, taskIds, catalogId, dbId, tblId, colToPartitions, partitionNames, colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, - cronExpression, forceFull, usingSqlForPartitionColumn, tblUpdateTime); + cronExpression, forceFull, usingSqlForPartitionColumn, tblUpdateTime, emptyJob); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 0bf24e0c28..39ae191d45 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -519,6 +519,7 @@ public class AnalysisManager implements Writable { infoBuilder.setColToPartitions(colToPartitions); infoBuilder.setTaskIds(Lists.newArrayList()); infoBuilder.setTblUpdateTime(table.getUpdateTime()); + infoBuilder.setEmptyJob(table instanceof OlapTable && table.getRowCount() == 0); return infoBuilder.build(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index e062e4eef8..81348c1f94 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -61,7 +61,8 @@ public class OlapAnalysisTask extends BaseAnalysisTask { public void doExecute() throws Exception { Set partitionNames = info.colToPartitions.get(info.colName); - if (partitionNames == null || partitionNames.isEmpty()) { + if ((info.emptyJob && 
info.analysisMethod.equals(AnalysisInfo.AnalysisMethod.SAMPLE)) + || partitionNames == null || partitionNames.isEmpty()) { if (partitionNames == null) { LOG.warn("Table {}.{}.{}, partitionNames for column {} is null. ColToPartitions:[{}]", info.catalogId, info.dbId, info.tblId, info.colName, info.colToPartitions); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index 3d6d2fe52a..857a50e234 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -88,7 +88,7 @@ public class StatisticConstants { public static final long HUGE_TABLE_DEFAULT_SAMPLE_ROWS = 4194304; public static final long HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES = 0; - public static final long HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(12); + public static final long HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(0); public static final long EXTERNAL_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(24); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index ee50471175..f799da5620 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -170,6 +170,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { .setLastExecTimeInMs(System.currentTimeMillis()) .setJobType(JobType.SYSTEM) .setTblUpdateTime(table.getUpdateTime()) + .setEmptyJob(table instanceof OlapTable && table.getRowCount() == 0) .build(); analysisInfos.add(jobInfo); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java index f500ab09f0..eb6672ffe1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -149,7 +149,7 @@ public class TableStatsMeta implements Writable { if (tableIf instanceof OlapTable) { rowCount = tableIf.getRowCount(); } - if (analyzedJob.colToPartitions.keySet() + if (!analyzedJob.emptyJob && analyzedJob.colToPartitions.keySet() .containsAll(tableIf.getBaseSchema().stream() .filter(c -> !StatisticsUtil.isUnsupportedType(c.getType())) .map(Column::getName).collect(Collectors.toSet()))) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java index 0b4b2203d0..87342202fb 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java @@ -299,7 +299,7 @@ public class StatisticsAutoCollectorTest { // A very huge table has been updated recently, so we should skip it this time stats.updatedTime = System.currentTimeMillis() - 1000; StatisticsAutoCollector autoCollector = new StatisticsAutoCollector(); - Assertions.assertTrue(autoCollector.skip(olapTable)); + Assertions.assertFalse(autoCollector.skip(olapTable)); // The update of this huge table is long time ago, so we shouldn't skip it this time stats.updatedTime = System.currentTimeMillis() - StatisticsUtil.getHugeTableAutoAnalyzeIntervalInMillis() - 10000; diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index e7e89f858f..64967280ce 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -1168,7 +1168,7 @@ PARTITION `p599` VALUES 
IN (599) sql """ INSERT INTO test_updated_rows SELECT * FROM test_updated_rows """ sql """ANALYZE TABLE test_updated_rows WITH SYNC""" def cnt2 = sql """ SHOW TABLE STATS test_updated_rows """ - assertEquals(Integer.valueOf(cnt2[0][0]), 0) + assertTrue(Integer.valueOf(cnt2[0][0]) == 0 || Integer.valueOf(cnt2[0][0]) == 8) // test analyze specific column sql """CREATE TABLE test_analyze_specific_column (col1 varchar(11451) not null, col2 int not null, col3 int not null)