diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java index 4b5f161d2b..11197cfe62 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java @@ -44,6 +44,7 @@ public class AnalyzeProperties { public static final String PROPERTY_PERIOD_SECONDS = "period.seconds"; public static final String PROPERTY_FORCE_FULL = "force.full"; public static final String PROPERTY_PARTITION_COLUMN_FROM_SQL = "partition.column.from.sql"; + public static final String PROPERTY_USE_AUTO_ANALYZER = "use.auto.analyzer"; public static final AnalyzeProperties DEFAULT_PROP = new AnalyzeProperties(new HashMap() { { @@ -72,6 +73,7 @@ public class AnalyzeProperties { .add(PROPERTY_PERIOD_CRON) .add(PROPERTY_FORCE_FULL) .add(PROPERTY_PARTITION_COLUMN_FROM_SQL) + .add(PROPERTY_USE_AUTO_ANALYZER) .build(); public AnalyzeProperties(Map properties) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 55ec9938d9..12550899a4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1304,6 +1304,10 @@ public class OlapTable extends Table implements MTMVRelatedTableIf { return true; } + // Check new partition first loaded. + if (tblStats.newPartitionLoaded != null && tblStats.newPartitionLoaded.get()) { + return true; + } // 1 Check row count. long currentRowCount = getRowCount(); long lastAnalyzeRowCount = tblStats.rowCount; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 71f5ce0fa8..d4d0231505 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -204,7 +204,21 @@ public class AnalysisManager implements Writable { } // Each analyze stmt corresponding to an analysis job. - public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlException { + public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlException, AnalysisException { + // Using auto analyzer if user specifies. + if (stmt.getAnalyzeProperties().getProperties().containsKey("use.auto.analyzer")) { + StatisticsAutoCollector autoCollector = Env.getCurrentEnv().getStatisticsAutoCollector(); + if (autoCollector.skip(stmt.getTable())) { + return; + } + List jobs = new ArrayList<>(); + autoCollector.createAnalyzeJobForTbl(stmt.getDb(), jobs, stmt.getTable()); + AnalysisInfo job = autoCollector.getNeedAnalyzeColumns(jobs.get(0)); + if (job != null) { + Env.getCurrentEnv().getStatisticsAutoCollector().createSystemAnalysisJob(job); + } + return; + } AnalysisInfo jobInfo = buildAndAssignJob(stmt); if (jobInfo == null) { return; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index 4408a0d925..f76037d80c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -21,7 +21,6 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; @@ -41,7 +40,6 @@ import org.apache.logging.log4j.Logger; import java.time.LocalTime; import java.util.ArrayList; import java.util.List; -import java.util.Set; import java.util.StringJoiner; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -221,11 +219,6 @@ public class StatisticsAutoCollector extends StatisticsCollector { if (table.needReAnalyzeTable(tblStats)) { needRunColumns = table.getColumnIndexPairs(table.getSchemaAllIndexes(false) .stream().map(Column::getName).collect(Collectors.toSet())); - } else if (table instanceof OlapTable && tblStats.newPartitionLoaded.get()) { - OlapTable olapTable = (OlapTable) table; - Set partitionNames = olapTable.getAllPartitions().stream() - .map(Partition::getName).collect(Collectors.toSet()); - needRunColumns = olapTable.getColumnIndexPairs(partitionNames); } if (needRunColumns == null || needRunColumns.isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java index bb2f19ac25..2d38b7ccd3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -19,7 +19,6 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.Pair; import org.apache.doris.common.io.Text; @@ -160,14 +159,6 @@ public class TableStatsMeta implements Writable, GsonPostProcessable { updatedRows.set(0); newPartitionLoaded.set(false); } - if (tableIf instanceof OlapTable) { - PartitionInfo partitionInfo = ((OlapTable) tableIf).getPartitionInfo(); - if (partitionInfo != null && analyzedJob.jobColumns - .containsAll(tableIf.getColumnIndexPairs(partitionInfo.getPartitionColumns().stream() - .map(Column::getName).collect(Collectors.toSet())))) { - newPartitionLoaded.set(false); - } - } } } @@ -180,6 +171,9 @@ public class TableStatsMeta implements Writable, GsonPostProcessable { if (indexesRowCount == null) { indexesRowCount = new ConcurrentHashMap<>(); } + if (newPartitionLoaded == null) { + newPartitionLoaded = new AtomicBoolean(false); + } } public long getRowCount(long indexId) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java index 8a803bd2a3..012ab383fd 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java @@ -285,12 +285,18 @@ public class AnalysisManagerTest { }; OlapTable olapTable = new OlapTable(); + TableStatsMeta stats0 = new TableStatsMeta( + 50, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>()) + .setColName("col1").setRowCount(100).build(), olapTable); + stats0.newPartitionLoaded.set(true); + Assertions.assertTrue(olapTable.needReAnalyzeTable(stats0)); + TableStatsMeta stats1 = new TableStatsMeta( 50, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>()) .setColName("col1").setRowCount(100).build(), olapTable); stats1.updatedRows.addAndGet(70); - Assertions.assertTrue(olapTable.needReAnalyzeTable(stats1)); + TableStatsMeta stats2 = new TableStatsMeta( 190, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>()) .setColName("col1").setRowCount(200).build(), olapTable); diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 5a6e753e0a..09a8ecbcad 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -164,7 +164,73 @@ suite("test_analyze") { exception = e } - // Test sample agg table value column. Min max is N/A when zone map is not available. + // Test partition load data for the first time. + sql """ + CREATE TABLE `partition_test` ( + `id` INT NOT NULL, + `name` VARCHAR(25) NOT NULL, + `comment` VARCHAR(152) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + PARTITION BY RANGE(`id`) + (PARTITION p1 VALUES [("0"), ("100")), + PARTITION p2 VALUES [("100"), ("200")), + PARTITION p3 VALUES [("200"), ("300"))) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1"); + """ + + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (1, '1', '1')""" + def partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + assertEquals(partition_result[0][0], "1") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (101, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test(id) with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (102, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (2, '2', '2')""" + sql """insert into partition_test values (3, '3', '3')""" + sql """insert into partition_test values (4, '4', '4')""" + sql """insert into partition_test values (5, '5', '5')""" + sql """insert into partition_test values (6, '6', '6')""" + sql """insert into partition_test values (7, '7', '7')""" + sql """insert into partition_test values (8, '8', '8')""" + sql """insert into partition_test values (9, '9', '9')""" + sql """insert into partition_test values (10, '10', '10')""" + sql """insert into partition_test values (103, '1', '1')""" + sql """insert into partition_test values (104, '1', '1')""" + sql """insert into partition_test values (105, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (201, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + partition_result = sql """show column stats partition_test(id)""" + assertEquals("id", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(name)""" + assertEquals("name", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(comment)""" + assertEquals("comment", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + + // Test sample agg table value column sql """ CREATE TABLE `agg_table_test` ( `id` BIGINT NOT NULL, @@ -184,6 +250,43 @@ suite("test_analyze") { assertEquals(agg_result[0][7], "N/A") assertEquals(agg_result[0][8], "N/A") + // Continue test partition load data for the first time. + def reported = false; + for (int i = 0; i < 10; i++) { + def data_result = sql """show data from partition_test""" + logger.info("show data from partition_test: " + data_result) + if (data_result[0][4] == '16') { + reported = true; + break; + } + sleep(1000) + } + if (!reported) { + logger.info("partition_test row count is not reported.") + } else { + sql """analyze table partition_test PROPERTIES("use.auto.analyzer"="true")""" + for (int i = 0; i < 10; i++) { + def auto_analyze_result = sql """show auto analyze partition_test""" + logger.info("show auto analyze partition_test result : " + auto_analyze_result) + if (auto_analyze_result[0][9] == 'FINISHED') { + logger.info("Auto analyze finished.") + auto_analyze_result = sql """show table stats partition_test""" + assertEquals(auto_analyze_result[0][6], "false") + auto_analyze_result = sql """show column stats partition_test(id)""" + assertEquals("id", auto_analyze_result[0][0]) + assertEquals("16.0", auto_analyze_result[0][2]) + auto_analyze_result = sql """show column stats partition_test(name)""" + assertEquals("name", auto_analyze_result[0][0]) + assertEquals("16.0", auto_analyze_result[0][2]) + auto_analyze_result = sql """show column stats partition_test(comment)""" + assertEquals("comment", auto_analyze_result[0][0]) + assertEquals("16.0", auto_analyze_result[0][2]) + break + } + sleep(1000) + } + } + def a_result_1 = sql """ ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 10 """ @@ -2595,42 +2698,6 @@ PARTITION `p599` VALUES IN (599) assertEquals("\'name1\'", result[0][7]) assertEquals("\'name3\'", result[0][8]) - // Test partititon load data for the first time. - sql """ - CREATE TABLE `partition_test` ( - `id` INT NOT NULL, - `name` VARCHAR(25) NOT NULL, - `comment` VARCHAR(152) NULL - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT 'OLAP' - PARTITION BY RANGE(`id`) - (PARTITION p1 VALUES [("0"), ("100")), - PARTITION p2 VALUES [("100"), ("200")), - PARTITION p3 VALUES [("200"), ("300"))) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( - "replication_num" = "1"); - """ - - sql """analyze table partition_test with sync""" - sql """insert into partition_test values (1, '1', '1')""" - def partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "true") - assertEquals(partition_result[0][0], "1") - sql """analyze table partition_test with sync""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - sql """insert into partition_test values (101, '1', '1')""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "true") - sql """analyze table partition_test(id) with sync""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - sql """insert into partition_test values (102, '1', '1')""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - // Test sample string type min max sql """ CREATE TABLE `string_min_max` ( @@ -2646,6 +2713,7 @@ PARTITION `p599` VALUES IN (599) """ sql """insert into string_min_max values (1,'name1'), (2, 'name2')""" sql """analyze table string_min_max with sync""" + explain { sql("select min(name), max(name) from string_min_max") contains "pushAggOp=NONE" @@ -2656,19 +2724,21 @@ PARTITION `p599` VALUES IN (599) contains "pushAggOp=MINMAX" } + sql """set forbid_unknown_col_stats=true""" + // Test alter sql """ - CREATE TABLE alter_test( - `id` int NOT NULL, - `name` VARCHAR(25) NOT NULL - )ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT "OLAP" - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( - "replication_num" = "1" - ); - """ + CREATE TABLE alter_test( + `id` int NOT NULL, + `name` VARCHAR(25) NOT NULL + )ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ); + """ sql """ANALYZE TABLE alter_test WITH SYNC""" def alter_result = sql """show table stats alter_test""" assertEquals("false", alter_result[0][7]) @@ -2710,17 +2780,17 @@ PARTITION `p599` VALUES IN (599) sql """CREATE DATABASE IF NOT EXISTS trigger""" sql """USE trigger""" sql """ - CREATE TABLE if not exists trigger_test( - `id` int NOT NULL, - `name` VARCHAR(152) - )ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT "OLAP" - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( - "replication_num" = "1" - ); - """ + CREATE TABLE if not exists trigger_test( + `id` int NOT NULL, + `name` VARCHAR(152) + )ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ); + """ // Test sample empty table def result_sample = sql """analyze table trigger_test with sample percent 10 with sync""" result_sample = sql """show column stats trigger_test""" @@ -2797,43 +2867,43 @@ PARTITION `p599` VALUES IN (599) // Test inject stats avg_size. sql """CREATE TABLE `date_dim` ( - `d_date_sk` BIGINT NOT NULL, - `d_date_id` CHAR(16) NOT NULL, - `d_date` DATE NULL, - `d_month_seq` INT NULL, - `d_week_seq` INT NULL, - `d_quarter_seq` INT NULL, - `d_year` INT NULL, - `d_dow` INT NULL, - `d_moy` INT NULL, - `d_dom` INT NULL, - `d_qoy` INT NULL, - `d_fy_year` INT NULL, - `d_fy_quarter_seq` INT NULL, - `d_fy_week_seq` INT NULL, - `d_day_name` CHAR(9) NULL, - `d_quarter_name` CHAR(6) NULL, - `d_holiday` CHAR(1) NULL, - `d_weekend` CHAR(1) NULL, - `d_following_holiday` CHAR(1) NULL, - `d_first_dom` INT NULL, - `d_last_dom` INT NULL, - `d_same_day_ly` INT NULL, - `d_same_day_lq` INT NULL, - `d_current_day` CHAR(1) NULL, - `d_current_week` CHAR(1) NULL, - `d_current_month` CHAR(1) NULL, - `d_current_quarter` CHAR(1) NULL, - `d_current_year` CHAR(1) NULL - ) ENGINE=OLAP - DUPLICATE KEY(`d_date_sk`) - DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 12 - PROPERTIES ( - "replication_allocation" = "tag.location.default: 1") + `d_date_sk` BIGINT NOT NULL, + `d_date_id` CHAR(16) NOT NULL, + `d_date` DATE NULL, + `d_month_seq` INT NULL, + `d_week_seq` INT NULL, + `d_quarter_seq` INT NULL, + `d_year` INT NULL, + `d_dow` INT NULL, + `d_moy` INT NULL, + `d_dom` INT NULL, + `d_qoy` INT NULL, + `d_fy_year` INT NULL, + `d_fy_quarter_seq` INT NULL, + `d_fy_week_seq` INT NULL, + `d_day_name` CHAR(9) NULL, + `d_quarter_name` CHAR(6) NULL, + `d_holiday` CHAR(1) NULL, + `d_weekend` CHAR(1) NULL, + `d_following_holiday` CHAR(1) NULL, + `d_first_dom` INT NULL, + `d_last_dom` INT NULL, + `d_same_day_ly` INT NULL, + `d_same_day_lq` INT NULL, + `d_current_day` CHAR(1) NULL, + `d_current_week` CHAR(1) NULL, + `d_current_month` CHAR(1) NULL, + `d_current_quarter` CHAR(1) NULL, + `d_current_year` CHAR(1) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`d_date_sk`) + DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 12 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1") """ sql """ - alter table date_dim modify column d_day_name set stats ('row_count'='73049', 'ndv'='7', 'num_nulls'='0', 'min_value'='Friday', 'max_value'='Wednesday', 'data_size'='521779') + alter table date_dim modify column d_day_name set stats ('row_count'='73049', 'ndv'='7', 'num_nulls'='0', 'min_value'='Friday', 'max_value'='Wednesday', 'data_size'='521779') """ alter_result = sql """show column cached stats date_dim""" @@ -2847,17 +2917,17 @@ PARTITION `p599` VALUES IN (599) // Test analyze after new empty partition created. sql """CREATE TABLE `part` ( - `id` INT NULL, - `colint` INT NULL + `id` INT NULL, + `colint` INT NULL ) ENGINE=OLAP DUPLICATE KEY(`id`) COMMENT 'OLAP' PARTITION BY RANGE(`id`) (PARTITION p1 VALUES [("-2147483648"), ("10000")), PARTITION p2 VALUES [("10000"), ("20000"))) - DISTRIBUTED BY HASH(`id`) BUCKETS 3 + DISTRIBUTED BY HASH(`id`) BUCKETS 3 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1" ); """