From 290070074a194f795ddd7389b823e97701f07a0c Mon Sep 17 00:00:00 2001 From: AKIRA <33112463+Kikyou1997@users.noreply.github.com> Date: Wed, 8 Nov 2023 12:03:44 +0900 Subject: [PATCH] [refactor](stats) refactor collection logic and opt some config (#26163) 1. not collect partition stats anymore 2. merge insert of stats 3. delete period collector since it is useless 4. remove enable_auto_sample 5. move some config related to stats to global session variable Before this PR, when analyze a table, the insert count equals column count times 2 After this PR, insert count of analyze table would reduce to column count / insert_merge_item_count. According to my test, when analyzing tpch lineitem, the insert sql count is 1 --- docs/en/docs/query-acceleration/statistics.md | 348 ++++++++++-------- .../docs/query-acceleration/statistics.md | 305 ++++++++------- .../java/org/apache/doris/common/Config.java | 34 +- .../java/org/apache/doris/catalog/Env.java | 7 - .../org/apache/doris/catalog/OlapTable.java | 7 +- .../doris/nereids/stats/StatsCalculator.java | 4 + .../org/apache/doris/qe/SessionVariable.java | 58 ++- .../apache/doris/statistics/AnalysisJob.java | 193 ++++++++++ .../doris/statistics/AnalysisManager.java | 90 ++--- .../statistics/AnalysisTaskExecutor.java | 28 +- .../doris/statistics/AnalysisTaskWrapper.java | 16 +- .../doris/statistics/BaseAnalysisTask.java | 108 ++---- .../apache/doris/statistics/ColStatsData.java | 14 + .../doris/statistics/HMSAnalysisTask.java | 135 +------ .../doris/statistics/JdbcAnalysisTask.java | 34 +- .../doris/statistics/MVAnalysisTask.java | 152 -------- .../doris/statistics/OlapAnalysisTask.java | 138 +------ .../doris/statistics/StatisticConstants.java | 14 +- .../statistics/StatisticsAutoCollector.java | 9 +- .../doris/statistics/StatisticsCollector.java | 11 +- .../statistics/StatisticsPeriodCollector.java | 50 --- .../org/apache/doris/statistics/StatsId.java | 15 +- .../doris/statistics/util/StatisticsUtil.java | 81 +++- 
.../doris/statistics/AnalysisJobTest.java | 287 +++++++++------ .../doris/statistics/AnalysisManagerTest.java | 37 +- .../statistics/AnalysisTaskExecutorTest.java | 16 +- .../apache/doris/statistics/AnalyzeTest.java | 185 ++++++++++ .../apache/doris/statistics/CacheTest.java | 40 +- .../statistics/OlapAnalysisTaskTest.java | 74 +++- .../StatisticsAutoCollectorTest.java | 73 ++++ .../statistics/util/StatisticsUtilTest.java | 46 ++- .../suites/statistics/analyze_stats.groovy | 100 ++++- .../statistics/test_agg_complex_type.groovy | 53 +++ 33 files changed, 1595 insertions(+), 1167 deletions(-) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java create mode 100644 regression-test/suites/statistics/test_agg_complex_type.groovy diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index 069c25fb1a..c25054094b 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -26,33 +26,33 @@ under the License. # Statistics -## Introduction to statistics information +Collecting statistics helps the optimizer understand data distribution characteristics. When performing Cost-Based Optimization (CBO), the optimizer uses these statistics to calculate the selectivity of predicates and estimate the cost of each execution plan. This allows for the selection of more optimal plans, significantly improving query efficiency. -Collecting statistics helps the optimizer understand data distribution characteristics. 
When performing Cost-Based Optimization (CBO), the optimizer utilizes these statistics to calculate the selectivity of predicates and estimate the cost of each execution plan. This enables the selection of more efficient plans, significantly improving query performance. +Currently, the following information is collected for each column: -Currently, the collected column-level information includes: +| Information | Description | +| :----------------- | :------------------------------ | +| `row_count` | Total number of rows | +| `data_size` | Total data size | +| `avg_size_byte` | Average length of values | +| `ndv` | Number of distinct values | +| `min` | Minimum value | +| `max` | Maximum value | +| `null_count` | Number of null values | -| Information | Description | -| :--------------- | :------------------------ | -| `row_count` | Total number of rows | -| `data_size` | Total data size | -| `avg_size_byte` | Average length of values | -| `ndv` | Number of distinct values | -| `min` | Minimum value | -| `max` | Maximum value | -| `null_count` | Number of null values | -## Collecting Statistics +## 1. Collecting Statistics -### Using the ANALYZE Statement +--- -Doris supports users in triggering the collection and updating of statistics by submitting the ANALYZE statement. +### 1.1 Manual Collection Using ANALYZE Statement + +Doris allows users to manually trigger the collection and update of statistics by submitting the ANALYZE statement. Syntax: ```SQL -ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ] +ANALYZE < TABLE | DATABASE table_name | db_name > [ (column_name [, ...]) ] [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ] [ PROPERTIES ("key" = "value", ...) ]; @@ -60,60 +60,110 @@ ANALYZE < TABLE | DATABASE table_name | db_name > Where: -- `table_name`: Specifies the target table. It can be in the `db_name.table_name` format. 
-- `partition_name`: The specified target partitions(for hive external table only)。Must be partitions exist in `table_name`. Multiple partition names are separated by commas. e.g. for single level partition: PARTITIONS(`event_date=20230706`), for multi level partition: PARTITIONS(`nation=US/city=Washington`). PARTITIONS(*) specifies all partitions, PARTITIONS WITH RECENT 30 specifies the latest 30 partitions. -- `column_name`: Specifies the target column. It must be an existing column in `table_name`, and multiple column names are separated by commas. -- `sync`: Collect statistics synchronously. Returns upon completion. If not specified, it executes asynchronously and returns a task ID. -- `sample percent | rows`: Collect statistics using sampling. You can specify either the sampling percentage or the number of sampled rows. -- `sql`: Collect statistics for external partition column with sql. By default, it uses meta data for partition columns, which is faster but may inaccurate for row count and size. Using sql could collect the accurate stats. +- `table_name`: The specified target table. It can be in the format `db_name.table_name`. +- `column_name`: The specified target column. It must be an existing column in `table_name`. You can specify multiple column names separated by commas. +- `sync`: Collect statistics synchronously. Returns after collection. If not specified, it executes asynchronously and returns a JOB ID. +- `sample percent | rows`: Collect statistics with sampling. You can specify a sampling percentage or a number of sampling rows. +- `sql`: Execute SQL to collect statistics for partitioned columns in external tables. By default, partitioned column information is collected from metadata, which is efficient but may not be accurate in terms of row count and data size. Users can specify using SQL to collect accurate partitioned column information. 
-### Automatic Statistics Collection +Here are some examples: -Users can enable this feature by setting the FE configuration option `enable_full_auto_analyze = true`. Once enabled, statistics on qualifying tables and columns will be automatically collected during specified time intervals. Users can specify the automatic collection time period by setting the `full_auto_analyze_start_time` (default is 00:00:00) and `full_auto_analyze_end_time` (default is 02:00:00) parameters. +Collect statistics for a table with a 10% sampling rate: -This feature collects statistics only for tables and columns that either have no statistics or have outdated statistics. When more than 20% of the data in a table is updated (this value can be configured using the `table_stats_health_threshold` parameter with a default of 80), Doris considers the statistics for that table to be outdated. +```sql +ANALYZE TABLE lineitem WITH SAMPLE PERCENT 10; +``` -For tables with a large amount of data (default is 5GiB), Doris will automatically use sampling to collect statistics, reducing the impact on the system and completing the collection job as quickly as possible. Users can adjust this behavior by setting the `huge_table_lower_bound_size_in_bytes` FE parameter. If you want to collect statistics for all tables in full, you can set the `enable_auto_sample` FE parameter to false. For tables with data size greater than `huge_table_lower_bound_size_in_bytes`, Doris ensures that the collection interval is not less than 12 hours (this time can be controlled using the `huge_table_auto_analyze_interval_in_millis` FE parameter). +Collect statistics for a table with a sample of 100,000 rows: -The default sample size for automatic sampling is 4194304(2^22) rows, but the actual sample size may be larger due to implementation reasons. If you want to sample more rows to obtain more accurate data distribution information, you can configure the `huge_table_default_sample_rows` FE parameter. 
+```sql +ANALYZE TABLE lineitem WITH SAMPLE ROWS 100000; +``` -### Task Management +
-#### Viewing Analyze Tasks +### 1.2 Automatic Collection -You can use `SHOW ANALYZE` to view information about statistics collection tasks. +This feature has been officially supported since 2.0.3 and is enabled by default. The basic operation logic is described below. After each import transaction commit, Doris records the number of rows updated by the import transaction to estimate the health of the existing table's statistics data (for tables that have not collected statistics, their health is 0). When the health of a table is below 60 (adjustable through the `table_stats_health_threshold` parameter), Doris considers the statistics for that table outdated and triggers statistics collection jobs for that table in subsequent operations. For tables with a health value above 60, no repeated collection is performed. + +The collection jobs for statistics themselves consume a certain amount of system resources. To minimize the overhead, for tables with a large amount of data (default 5 GiB, adjustable with the FE parameter `huge_table_lower_bound_size_in_bytes`), Doris automatically uses sampling to collect statistics. Automatic sampling defaults to sampling 4,194,304 (2^22) rows to reduce the system's burden and complete the collection job as quickly as possible. If you want to sample more rows to obtain a more accurate data distribution, you can increase the sampling row count by adjusting the `huge_table_default_sample_rows` parameter. In addition, for tables with data larger than `huge_table_lower_bound_size_in_bytes` * 5, Doris ensures that the collection time interval is not less than 12 hours (which can be controlled by adjusting the `huge_table_auto_analyze_interval_in_millis` parameter). 
+ +If you are concerned about automatic collection jobs interfering with your business, you can specify a time frame for the automatic collection jobs to run during periods of low business load by setting the `full_auto_analyze_start_time` and `full_auto_analyze_end_time` parameters according to your needs. You can also completely disable this feature by setting the `enable_full_auto_analyze` parameter to `false`. + +<br /
+ + + +## 2. Job Management + +--- + +### 2.1 View Analyze Jobs + +Use `SHOW ANALYZE` to view information about statistics collection jobs. Syntax: ```SQL -SHOW ANALYZE < table_name | job_id > +SHOW [AUTO] ANALYZE < table_name | job_id > [ WHERE [ STATE = [ "PENDING" | "RUNNING" | "FINISHED" | "FAILED" ] ] ]; ``` -- `table_name`: Specifies the table for which you want to view statistics collection tasks. It can be in the form of `db_name.table_name`. If not specified, it returns information for all statistics collection tasks. -- `job_id`: The job ID of the statistics information task returned when executing `ANALYZE`. If not specified, it returns information for all statistics collection tasks. +- AUTO: Show historical information for automatic collection jobs only. Note that, by default, the status of only the last 20,000 completed automatic collection jobs is retained. +- table_name: Table name, specify to view statistics job information for that table. It can be in the format `db_name.table_name`. When not specified, it returns information for all statistics jobs. +- job_id: Job ID for statistics collection, obtained when executing `ANALYZE`. When not specified, this command returns information for all statistics jobs. 
Output: -| Column Name | Description | -| :-------------------- | :------------- | -| `job_id` | Job ID | -| `catalog_name` | Catalog Name | -| `db_name` | Database Name | -| `tbl_name` | Table Name | -| `col_name` | Column Name | -| `job_type` | Job Type | -| `analysis_type` | Analysis Type | -| `message` | Task Message | -| `last_exec_time_in_ms`| Last Execution Time | -| `state` | Task State | -| `schedule_type` | Schedule Type | +| Column Name | Description | +| :--------------------- | :--------------- | +| `job_id` | Job ID | +| `catalog_name` | Catalog Name | +| `db_name` | Database Name | +| `tbl_name` | Table Name | +| `col_name` | Column Name List | +| `job_type` | Job Type | +| `analysis_type` | Analysis Type | +| `message` | Job Information | +| `last_exec_time_in_ms` | Last Execution Time | +| `state` | Job Status | +| `schedule_type` | Scheduling Method | +Here's an example: -You can use `SHOW ANALYZE TASK STATUS [job_id]` to check the completion status of collecting statistics for each column. +```sql +mysql> show analyze 245073\G; +*************************** 1. row *************************** + job_id: 245073 + catalog_name: internal + db_name: default_cluster:tpch + tbl_name: lineitem + col_name: [l_returnflag,l_receiptdate,l_tax,l_shipmode,l_suppkey,l_shipdate,l_commitdate,l_partkey,l_orderkey,l_quantity,l_linestatus,l_comment,l_extendedprice,l_linenumber,l_discount,l_shipinstruct] + job_type: MANUAL + analysis_type: FUNDAMENTALS + message: +last_exec_time_in_ms: 2023-11-07 11:00:52 + state: FINISHED + progress: 16 Finished | 0 Failed | 0 In Progress | 16 Total + schedule_type: ONCE +``` + +
+ +### 2.2 View Column Statistics Collection Status + +Each collection job can contain one or more tasks, with each task corresponding to the collection of a column. Users can use the following command to view the completion status of statistics collection for each column. + +Syntax: + +```sql +SHOW ANALYZE TASK STATUS [job_id] +``` + +Here's an example: ``` -mysql> show analyze task status 20038; +mysql> show analyze task status 20038 ; +---------+----------+---------+----------------------+----------+ | task_id | col_name | message | last_exec_time_in_ms | state | +---------+----------+---------+----------------------+----------+ @@ -124,55 +174,11 @@ mysql> show analyze task status 20038; +---------+----------+---------+----------------------+----------+ ``` -#### Terminating Analyze Tasks +
-You can terminate running statistics collection tasks using `KILL ANALYZE`. +### 2.3 View Column Statistics -Syntax: - -```SQL -KILL ANALYZE job_id; -``` - -- `job_id`: The job ID of the statistics information task. It is returned when executing `ANALYZE`, or you can obtain it using the `SHOW ANALYZE` statement. - -Example: - -- Terminating statistics collection task with job ID 52357. - -```SQL -mysql> KILL ANALYZE 52357; -``` - -#### Viewing Statistics Information - -#### Table Statistics Information - -You can use `SHOW TABLE STATS` to view an overview of statistics collection for a table. - -Syntax: - -```SQL -SHOW TABLE STATS table_name; -``` - -- `table_name`: The name of the table for which you want to view statistics collection information. It can be in the form of `db_name.table_name`. - -Output: - -| Column Name | Description | -| :--------------- | :------------------------------------- | -| `row_count` | Number of rows (may not be the exact count at the time of execution) | -| `method` | Collection method (FULL/SAMPLE) | -| `type` | Type of statistics data | -| `updated_time` | Last update time | -| `columns` | Columns for which statistics were collected | -| `trigger` | Trigger method for statistics collection (Auto/User) | - - -#### Viewing Column Statistics Information - -You can use `SHOW COLUMN [cached] STATS` to view information about the number of distinct values and NULLs in columns. +Use `SHOW COLUMN STATS` to view various statistics data for columns. Syntax: @@ -180,100 +186,140 @@ Syntax: SHOW COLUMN [cached] STATS table_name [ (column_name [, ...]) ]; ``` -- `cached`: Displays statistics information from the current FE memory cache. -- `table_name`: The name of the table for which you want to view column statistics information. It can be in the form of `db_name.table_name`. -- `column_name`: The specific column(s) you want to view statistics for. 
It must be a column that exists in `table_name`, and multiple column names can be separated by commas. +Where: -#### Modifying Statistics Information +- cached: Show statistics information in the current FE memory cache. +- table_name: The target table for collecting statistics. It can be in the format `db_name.table_name`. +- column_name: Specifies the target column, which must be an existing column in `table_name`. You can specify multiple column names separated by commas. -Users can adjust statistics information using the `ALTER` statement. +Here's an example: -```SQL -ALTER TABLE table_name MODIFY COLUMN column_name SET STATS ('stat_name' = 'stat_value', ...) [ PARTITION (partition_name) ]; +```sql +mysql> show column stats lineitem(l_tax)\G; +*************************** 1. row *************************** + column_name: l_tax + count: 6001215.0 + ndv: 9.0 + num_null: 0.0 + data_size: 4.800972E7 +avg_size_byte: 8.0 + min: 0.00 + max: 0.08 + method: FULL + type: FUNDAMENTALS + trigger: MANUAL + query_times: 0 + updated_time: 2023-11-07 11:00:46 ``` -- `table_name`: The name of the table for which you want to modify statistics information. It can be in the form of `db_name.table_name`. -- `column_name`: The specific column for which you want to modify statistics information. It must be a column that exists in `table_name`, and you can modify statistics information for one column at a time. -- `stat_name` and `stat_value`: The corresponding statistics information name and its value. Multiple statistics can be modified, separated by commas. You can modify statistics such as `row_count`, `ndv`, `num_nulls`, `min_value`, `max_value`, and `data_size`. +
-#### Delete Statistics +### 2.4 Table Collection Overview -Users can delete statistics using the `DROP` statement, which allows them to specify the table, partition, or column for which they want to delete statistics based on the provided parameters. When deleted, both column statistics and column histogram information are removed. +Use `SHOW TABLE STATS` to view an overview of statistics collection for a table. Syntax: ```SQL -DROP [ EXPIRED ] STATS [ table_name [ (column_name [, ...]) ] ]; +SHOW TABLE STATS table_name; ``` -#### Delete Analyze Job +Where: -Used to delete automatic/periodic Analyze jobs based on the job ID. +- table_name: The target table name. It can be in the format `db_name.table_name`. + +Output: + +| Column Name | Description | +| :--------------------- | :--------------- | +| `updated_rows` | Updated rows since the last ANALYZE | +| `query_times` | Reserved column for recording the number of times the table was queried in future versions | +| `row_count` | Number of rows (does not reflect the exact number of rows at the time of command execution) | +| `updated_time` | Last update time | +| `columns` | Columns for which statistics information has been collected | + +Here's an example: ```sql -DROP ANALYZE JOB [JOB_ID] +mysql> show table stats lineitem \G; +*************************** 1. row *************************** +updated_rows: 0 + query_times: 0 + row_count: 6001215 +updated_time: 2023-11-07 + columns: [l_returnflag, l_receiptdate, l_tax, l_shipmode, l_suppkey, l_shipdate, l_commitdate, l_partkey, l_orderkey, l_quantity, l_linestatus, l_comment, l_extendedprice, l_linenumber, l_discount, l_shipinstruct] + trigger: MANUAL ``` -### View Automatic Collection Task Execution Status +
-This command is used to check the completion status of automatic collection tasks after enabling automatic collection functionality. +### 2.5 Terminate Statistics Jobs -```sql -SHOW AUTO ANALYZE [table_name] - [ WHERE [ STATE = [ "PENDING" | "RUNNING" | "FINISHED" | "FAILED" ] ] ]; +Use `KILL ANALYZE` to terminate running statistics jobs. + +Syntax: + +```SQL +KILL ANALYZE job_id; ``` -Automatic collection tasks do not support viewing the completion status and failure reasons for each column individually. By default, it only retains the status of the last 20,000 completed automatic collection tasks. +Where: -## Configuration Options +- job_id: Job ID for statistics collection. Obtained when performing asynchronous statistics collection using the `ANALYZE` statement, and it can also be obtained through the `SHOW ANALYZE` statement. -| fe conf option | comment | default value | -|---------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| -| statistics_sql_parallel_exec_instance_num | Controls the number of concurrent instances/pipeline tasks for each statistics collection SQL on the BE side. | 1 | -| statistics_sql_mem_limit_in_bytes | Controls the amount of BE memory that each statistics collection SQL can use. | 2L * 1024 * 1024 * 1024 (2GiB) | -| statistics_simultaneously_running_task_num | After submitting asynchronous jobs using `ANALYZE TABLE[DATABASE]`, this parameter limits the number of columns that can be analyzed simultaneously. All asynchronous tasks are collectively constrained by this parameter. | 5 | -| analyze_task_timeout_in_minutes | Timeout for AnalyzeTask execution. 
| 12 hours | | stats_cache_size| The actual memory usage of statistics cache depends heavily on the characteristics of the data because the average size of maximum/minimum values and the number of buckets in histograms can vary greatly in different datasets and scenarios. Additionally, factors like JVM versions can also affect it. Below is the memory size occupied by statistics cache with 100,000 items. The average length of maximum/minimum values per item is 32, the average length of column names is 16, and the statistics cache occupies a total of 61.2777404785MiB of memory. It is strongly discouraged to analyze columns with very large string values as this may lead to FE memory overflow. | 100000 | |enable_auto_sample|Enable automatic sampling for large tables. When enabled, statistics will be automatically collected through sampling for tables larger than the `huge_table_lower_bound_size_in_bytes` threshold.| false| |auto_analyze_job_record_count|Controls the persistence of records for automatically triggered statistics collection jobs.|20000| |huge_table_default_sample_rows|Defines the number of sample rows for large tables when automatic sampling is enabled.|4194304| |huge_table_lower_bound_size_in_bytes|Defines the lower size threshold for large tables. When `enable_auto_sample` is enabled, statistics will be automatically collected through sampling for tables larger than this value.|5368709120| |huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Within this interval, tables larger than `huge_table_lower_bound_size_in_bytes` will only be analyzed once.|43200000| |table_stats_health_threshold|Takes a value between 0-100. 
When the data update volume reaches (100 - table_stats_health_threshold)% since the last statistics collection operation, the statistics for the table are considered outdated.|80| +Example: -|Session Variable|Description|Default Value| -|---|---|---| -|full_auto_analyze_start_time|Start time for automatic statistics collection|00:00:00| -|full_auto_analyze_end_time|End time for automatic statistics collection|02:00:00| -|enable_full_auto_analyze|Enable automatic collection functionality|true| +- Terminate statistics job with ID 52357. -ATTENTION: The session variables listed above must be set globally using SET GLOBAL. +```SQL +mysql> KILL ANALYZE 52357; +``` -## Usage Recommendations +
-Based on our testing, on tables with data size (i.e., actual storage space) below 128GiB, there is usually no need to modify the default configuration settings unless it is necessary to avoid resource contention during peak business hours by adjusting the execution time of the automatic collection feature. -Depending on the cluster configuration, automatic collection tasks typically consume around 20% of CPU resources. Therefore, users should adjust the execution time of the automatic collection feature to avoid resource contention during peak business hours, depending on their specific business needs. +```markdown +## 3. Session Variables and Configuration Options -Since ANALYZE is a resource-intensive operation, it is best to avoid executing such operations during peak business hours to prevent disruption to the business. Additionally, in cases of high cluster load, ANALYZE operations are more likely to fail. Furthermore, it is advisable to avoid performing full ANALYZE on the entire database or table. Typically, it is sufficient to perform ANALYZE on columns that are frequently used as predicate conditions, in JOIN conditions, as aggregation fields, or as ID fields. If a user's SQL queries involve a large number of such operations and the table has no statistics or very outdated statistics, we recommend: +--- -* Performing ANALYZE on the columns involved in complex queries before submitting the complex query, as poorly planned complex queries can consume a significant amount of system resources and may lead to resource exhaustion or timeouts. -* If you have configured periodic data import routines for Doris, it is recommended to execute ANALYZE after the data import is complete to ensure that subsequent query planning can use the most up-to-date statistics. You can automate this setting using Doris's existing job scheduling framework. 
-* When significant changes occur in the table's data, such as creating a new table and completing data import, it is recommended to run ANALYZE on the corresponding table. +### 3.1 Session Variables -## Common Issues +| Session Variable | Description | Default Value | +| ----------------------------- | -------------------------------------------- | ------------- | +| full_auto_analyze_start_time | Start time for automatic statistics collection | 00:00:00 | +| full_auto_analyze_end_time | End time for automatic statistics collection | 23:59:59 | +| enable_full_auto_analyze | Enable automatic collection functionality | true | +| huge_table_default_sample_rows | Sampling rows for large tables | 4194304 | +| huge_table_lower_bound_size_in_bytes | Tables with size greater than this value will be automatically sampled during collection of statistics | 5368709120 | +| huge_table_auto_analyze_interval_in_millis | Controls the minimum time interval for automatic ANALYZE on large tables. Tables with sizes greater than `huge_table_lower_bound_size_in_bytes * 5` will be ANALYZEed only once within this time interval. | 43200000 | +| table_stats_health_threshold | Ranges from 0 to 100. If data updates since the last statistics collection exceed `(100 - table_stats_health_threshold)%`, the table's statistics are considered outdated. | 60 | +| analyze_timeout | Controls the timeout for synchronous ANALYZE in seconds | 43200 | -### ANALYZE WITH SYNC Execution Failed: Failed to analyze following columns... +
-The SQL execution time is controlled by the `query_timeout` session variable, which has a default value of 300 seconds. Statements like `ANALYZE DATABASE/TABLE` often take longer, easily exceeding this time limit and being canceled. It is recommended to increase the value of `query_timeout` based on the data volume of the ANALYZE object. +### 3.2 FE Configuration Options -### ANALYZE Submission Error: Stats table not available... +The following FE configuration options are typically not a major concern: + +| FE Configuration Option | Description | Default Value | +| ---------------------------------- | ---------------------------------------- | ------------- | +| analyze_record_limit | Controls the persistence of statistics job execution records | 20000 | +| stats_cache_size | FE-side statistics cache entries | 500,000 | +| statistics_simultaneously_running_task_num | Number of asynchronous jobs that can run simultaneously | 3 | +| statistics_sql_mem_limit_in_bytes | Controls the amount of BE memory each statistics SQL can use | 2,147,483,648 bytes (2 GiB) | + +
+``` + +## 4. Common Issues + +### 4.1 ANALYZE Submission Error: Stats table not available... When ANALYZE is executed, statistics data is written to the internal table `__internal_schema.column_statistics`. FE checks the tablet status of this table before executing ANALYZE. If there are unavailable tablets, the task is rejected. Please check the BE cluster status if this error occurs. Users can use `SHOW BACKENDS\G` to verify the BE (Backend) status. If the BE status is normal, you can use the command `ADMIN SHOW REPLICA STATUS FROM __internal_schema.[tbl_in_this_db]` to check the tablet status within this database, ensuring that the tablet status is also normal. -### Failure of ANALYZE on Large Tables +### 4.2 Failure of ANALYZE on Large Tables Due to resource limitations, ANALYZE on some large tables may timeout or exceed BE memory limits. In such cases, it is recommended to use `ANALYZE ... WITH SAMPLE...`. - diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index d9aac9b678..7700ae3db4 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -26,7 +26,6 @@ under the License. # 统计信息 - 通过收集统计信息有助于优化器了解数据分布特性,在进行CBO(基于成本优化)时优化器会利用这些统计信息来计算谓词的选择性,并估算每个执行计划的成本。从而选择更优的计划以大幅提升查询效率。 当前收集列的如下信息: @@ -41,17 +40,21 @@ under the License. | `max` | 最⼤值 | | `null_count` | 空值数量 | -## 收集统计信息 +
-### 使用ANALYZE语句 -Doris支持用户通过提交ANALYZE语句来触发统计信息的收集和更新。 +## 1. 收集统计信息 + +--- + +### 1.1 使用ANALYZE语句手动收集 + +Doris支持用户通过提交ANALYZE语句来手动触发统计信息的收集和更新。 语法: ```SQL ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ] [ (column_name [, ...]) ] [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ] [ PROPERTIES ("key" = "value", ...) ]; @@ -59,61 +62,109 @@ ANALYZE < TABLE | DATABASE table_name | db_name > 其中: -- table_name: 指定的的目标表。可以是  `db_name.table_name`  形式。 -- partition_name: 指定的目标分区(目前只针对Hive外表)。必须是  `table_name`  中存在的分区,多个列名称用逗号分隔。分区名样例: 单层分区PARTITIONS(`event_date=20230706`),多层分区PARTITIONS(`nation=CN/city=Beijing`)。PARTITIONS (*)指定所有分区,PARTITIONS WITH RECENT 100指定最新的100个分区。 +- table_name: 指定的目标表。可以是  `db_name.table_name`  形式。 - column_name: 指定的目标列。必须是  `table_name`  中存在的列,多个列名称用逗号分隔。 -- sync:同步收集统计信息。收集完后返回。若不指定则异步执行并返回任务 ID。 +- sync:同步收集统计信息。收集完后返回。若不指定则异步执行并返回JOB ID。 - sample percent | rows:抽样收集统计信息。可以指定抽样比例或者抽样行数。 - sql:执行sql来收集外表分区列统计信息。默认从元数据收集分区列信息,这样效率比较高但是行数和数据量大小可能不准。用户可以指定使用sql来收集,这样可以收集到准确的分区列信息。 -### 自动收集 +下面是一些例子 -用户可以通过设置FE配置项`enable_full_auto_analyze = true`来启用本功能。开启后,将在指定的时间段内自动收集满足条件的库表上的统计信息。用户可以通过设置参数`full_auto_analyze_start_time`(默认为00:00:00)和参数`full_auto_analyze_end_time`(默认为02:00:00)来指定自动收集的时间段。 +对一张表按照10%的比例采样收集统计数据: -此功能仅对没有统计信息或者统计信息过时的库表进行收集。当一个表的数据更新了20%(该值可通过参数`table_stats_health_threshold`(默认为80)配置)以上时,Doris会认为该表的统计信息已经过时。 +```sql +ANALYZE TABLE lineitem WITH SAMPLE PERCENT 10; +``` -对于数据量较大(默认为5GiB)的表,Doris会自动采取采样的方式去收集,以尽可能降低对系统造成的负担并尽快完成收集作业,用户可通过设置FE参数`huge_table_lower_bound_size_in_bytes`来调节此行为。如果希望对所有的表都采取全量收集,可配置FE参数`enable_auto_sample`为false。同时对于数据量大于`huge_table_lower_bound_size_in_bytes`的表,Doris保证其收集时间间隔不小于12小时(该时间可通过FE参数`huge_table_auto_analyze_interval_in_millis`控制)。 +对一张表按采样10万行收集统计数据 -自动采样默认采样4194304(2^22)行,但由于实现方式的原因实际采样数可能大于该值。如果希望采样更多的行以获得更准确的数据分布信息,可通过FE参数`huge_table_default_sample_rows`配置。 +```sql +ANALYZE TABLE lineitem WITH SAMPLE ROWS 100000; 
+``` -### 任务管理 +
-#### 查看统计任务 +### 1.2 自动收集 -通过 `SHOW ANALYZE` 来查看统计信息收集任务的信息。 +此功能从2.0.3开始正式支持,默认为全天开启状态。下面对其基本运行逻辑进行阐述,在每次导入事务提交后,Doris将记录本次导入事务更新的表行数用以估算当前已有表的统计数据的健康度(对于没有收集过统计数据的表,其健康度为0)。当表的健康度低于60(可通过参数`table_stats_health_threshold`调节)时,Doris会认为该表的统计信息已经过时,并在之后触发对该表的统计信息收集作业。而对于统计信息健康度高于60的表,则不会重复进行收集。 + +统计信息的收集作业本身需要占用一定的系统资源,为了尽可能降低开销,对于数据量较大(默认为5GiB,可通过设置FE参数`huge_table_lower_bound_size_in_bytes`来调节此行为)的表,Doris会自动采取采样的方式去收集,自动采样默认采样4194304(2^22)行,以尽可能降低对系统造成的负担并尽快完成收集作业。如果希望采样更多的行以获得更准确的数据分布信息,可通过调整参数`huge_table_default_sample_rows`增大采样行数。另外对于数据量大于`huge_table_lower_bound_size_in_bytes` * 5 的表,Doris保证其收集时间间隔不小于12小时(该时间可通过调整参数`huge_table_auto_analyze_interval_in_millis`控制)。 + +如果担心自动收集作业对业务造成干扰,可结合自身需求通过设置参数`full_auto_analyze_start_time`和参数`full_auto_analyze_end_time`指定自动收集作业在业务负载较低的时间段执行。也可以通过设置参数`enable_full_auto_analyze` 为`false`来彻底关闭本功能。 + +
+ +## 2. 作业管理 + +--- + +### 2.1 查看统计作业 + +通过 `SHOW ANALYZE` 来查看统计信息收集作业的信息。 语法如下: ```SQL -SHOW ANALYZE < table_name | job_id > +SHOW [AUTO] ANALYZE < table_name | job_id > [ WHERE [ STATE = [ "PENDING" | "RUNNING" | "FINISHED" | "FAILED" ] ] ]; ``` -- table_name:表名,指定后可查看该表对应的统计任务信息。可以是  `db_name.table_name`  形式。不指定时返回所有统计任务信息。 -- job_id:统计信息任务 ID,执行 `ANALYZE` 非同步收集统计信息时所返回的值。不指定时返回所有统计任务信息。 +- AUTO:仅仅展示自动收集历史作业信息。需要注意的是默认只保存过去20000个执行完毕的自动收集作业的状态。 +- table_name:表名,指定后可查看该表对应的统计作业信息。可以是  `db_name.table_name`  形式。不指定时返回所有统计作业信息。 +- job_id:统计信息作业 ID,执行 `ANALYZE` 异步收集时得到。不指定id时此命令返回所有统计作业信息。 输出: | 列名 | 说明 | | :--------------------- | :----------- | -| `job_id` | 统计任务 ID | +| `job_id` | 统计作业 ID | | `catalog_name` | catalog 名称 | | `db_name` | 数据库名称 | | `tbl_name` | 表名称 | -| `col_name` | 列名称 | -| `job_type` | 任务类型 | +| `col_name` | 列名称列表 | +| `job_type` | 作业类型 | | `analysis_type` | 统计类型 | -| `message` | 任务信息 | +| `message` | 作业信息 | | `last_exec_time_in_ms` | 上次执行时间 | -| `state` | 任务状态 | +| `state` | 作业状态 | | `schedule_type` | 调度方式 | +下面是一个例子: -可通过`SHOW ANALYZE TASK STATUS [job_id]`,查看具体每个列统计信息的收集完成情况。 +```sql +mysql> show analyze 245073\G; +*************************** 1. row *************************** + job_id: 245073 + catalog_name: internal + db_name: default_cluster:tpch + tbl_name: lineitem + col_name: [l_returnflag,l_receiptdate,l_tax,l_shipmode,l_suppkey,l_shipdate,l_commitdate,l_partkey,l_orderkey,l_quantity,l_linestatus,l_comment,l_extendedprice,l_linenumber,l_discount,l_shipinstruct] + job_type: MANUAL + analysis_type: FUNDAMENTALS + message: +last_exec_time_in_ms: 2023-11-07 11:00:52 + state: FINISHED + progress: 16 Finished | 0 Failed | 0 In Progress | 16 Total + schedule_type: ONCE +``` + +
+ +### 2.2 查看每列统计信息收集情况 + +每个收集作业中可以包含一到多个任务,每个任务对应一列的收集。用户可通过如下命令查看具体每列的统计信息收集完成情况。 + +语法: + +```sql +SHOW ANALYZE TASK STATUS [job_id] +``` + +下面是一个例子: ``` -mysql> show analyze task status 20038 ; +mysql> show analyze task status 20038 ; +---------+----------+---------+----------------------+----------+ | task_id | col_name | message | last_exec_time_in_ms | state | +---------+----------+---------+----------------------+----------+ @@ -126,60 +177,11 @@ mysql> show analyze task status 20038 ; ``` -#### 终止统计任务 +
-通过 `KILL ANALYZE` 来终止正在运行的统计任务。 +### 2.3 查看列统计信息 -语法如下: - -```SQL -KILL ANALYZE job_id; -``` - -其中: - -- job_id:统计信息任务 ID。执行 `ANALYZE` 非同步收集统计信息时所返回的值,也可以通过 `SHOW ANALYZE` 语句获取。 - -示例: - -- 终止 ID 为 52357 的统计任务。 - -```SQL -mysql> KILL ANALYZE 52357; -``` - -#### 查看统计信息 - -#### 表统计信息 - - -通过 `SHOW TABLE STATS` 表的统计信息收集概况。 - -语法如下: - -```SQL -SHOW TABLE STATS table_name; -``` - -其中: - -- table_name: 导入数据的目标表。可以是  `db_name.table_name`  形式。 - -输出: - -| 列名 | 说明 | -| :------------------ | :--------------------- | -| `row_count` | 行数(不反映命令执行时的准确行数) | -| `method` | 收集方式(全量/采样) | -| `type` | 统计数据的类型 | -| `updated_time` | 上次更新时间 | -| `columns` | 收集过统计信息的列 | -| `trigger` | 统计信息收集触发方式(系统自动触发/用户触发) | - - -#### 查看列统计信息 - -通过 `SHOW COLUMN STATS` 来查看列的不同值数以及 `NULL` 数量等信息。 +通过 `SHOW COLUMN STATS` 来查看列的各项统计数据。 语法如下: @@ -193,98 +195,137 @@ SHOW COLUMN [cached] STATS table_name [ (column_name [, ...]) ]; - table_name: 收集统计信息的目标表。可以是  `db_name.table_name`  形式。 - column_name: 指定的目标列,必须是  `table_name`  中存在的列,多个列名称用逗号分隔。 -#### 修改统计信息 +下面是一个例子: -⽤户可以通过 `ALTER` 语句调整统计信息。 +```sql +mysql> show column stats lineitem(l_tax)\G; +*************************** 1. row *************************** + column_name: l_tax + count: 6001215.0 + ndv: 9.0 + num_null: 0.0 + data_size: 4.800972E7 +avg_size_byte: 8.0 + min: 0.00 + max: 0.08 + method: FULL + type: FUNDAMENTALS + trigger: MANUAL + query_times: 0 + updated_time: 2023-11-07 11:00:46 + +``` + +
+ +### 2.4 表收集概况 + +通过 `SHOW TABLE STATS` 查看表的统计信息收集概况。 + +语法如下: ```SQL -ALTER TABLE table_name MODIFY COLUMN column_name SET STATS ('stat_name' = 'stat_value', ...) [ PARTITION (partition_name) ]; +SHOW TABLE STATS table_name; ``` 其中: -- table_name: 删除统计信息的目标表。可以是 `db_name.table_name` 形式。 -- column_name: 指定的目标列,必须是 `table_name` 中存在的列,每次只能修改一列的统计信息。 -- stat_name 和 stat_value: 相应的统计信息名称和统计信息信息的值,多个统计信息逗号分隔。可以修改的统计信息包括 `row_count`, `ndv`, `num_nulls`, `min_value`, `max_value`, `data_size`。 +- table_name: 目标表表名。可以是  `db_name.table_name`  形式。 -#### 删除统计信息 +输出: -⽤户通过 `DROP` 语句删除统计信息,根据提供的参数,删除指定的表、分区或列的统计信息。删除时会同时删除列统计信息和列直方图信息。 +| 列名 | 说明 | +| :------------------ | :--------------------- | +|`updated_rows`|自上次ANALYZE以来该表的更新行数| +|`query_times`|保留列,后续版本用以记录该表查询次数| +|`row_count`| 行数(不反映命令执行时的准确行数)| +|`updated_time`| 上次更新时间| +|`columns`| 收集过统计信息的列| +|`trigger`|触发方式| -语法: +下面是一个例子: + +```sql +mysql> show table stats lineitem \G; +*************************** 1. row *************************** +updated_rows: 0 + query_times: 0 + row_count: 6001215 +updated_time: 2023-11-07 + columns: [l_returnflag, l_receiptdate, l_tax, l_shipmode, l_suppkey, l_shipdate, l_commitdate, l_partkey, l_orderkey, l_quantity, l_linestatus, l_comment, l_extendedprice, l_linenumber, l_discount, l_shipinstruct] + trigger: MANUAL +``` + +
+ +### 2.5 终止统计作业 + +通过 `KILL ANALYZE` 来终止正在运行的统计作业。 + +语法如下: ```SQL -DROP [ EXPIRED ] STATS [ table_name [ (column_name [, ...]) ] ]; +KILL ANALYZE job_id; ``` +其中: -#### 删除Analyze Job +- job_id:统计信息作业 ID。执行 `ANALYZE` 异步收集统计信息时所返回的值,也可以通过 `SHOW ANALYZE` 语句获取。 -用于根据job id删除自动/周期Analyze作业 +示例: -```sql -DROP ANALYZE JOB [JOB_ID] +- 终止 ID 为 52357 的统计作业。 + +```SQL +mysql> KILL ANALYZE 52357; ``` -### 查看自动收集任务执行情况 +
-此命令用于打开自动收集功能后,查看自动收集任务的完成状态。 +## 3. 会话变量及配置项 -```sql -SHOW AUTO ANALYZE [ptable_name] - [ WHERE [ STATE = [ "PENDING" | "RUNNING" | "FINISHED" | "FAILED" ] ] ]; -``` +--- -自动收集任务不支持查看每个列的具完成情况及失败原因。默认只保存过去20000个执行完毕的自动收集任务的状态。 - -## 配置项 - -|fe conf option | comment | default value | -|---------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| -| statistics_sql_parallel_exec_instance_num | 控制每个统计信息收集SQL在BE侧的并发实例数/pipeline task num | 1 | -| statistics_sql_mem_limit_in_bytes | 控制每个统计信息SQL可占用的BE内存 | 2L * 1024 * 1024 * 1024 (2GiB) | -| statistics_simultaneously_running_task_num | 通过`ANALYZE TABLE[DATABASE]`提交异步作业后,可同时analyze的列的数量,所有异步任务共同受到该参数约束 | 5 | -| analyze_task_timeout_in_minutes | AnalyzeTask执行超时时间 | 12 hours | -|stats_cache_size| 统计信息缓存的实际内存占用大小高度依赖于数据的特性,因为在不同的数据集和场景中,最大/最小值的平均大小和直方图的桶数量会有很大的差异。此外,JVM版本等因素也会对其产生影响。下面给出统计信息缓存在包含100000个项目时所占用的内存大小。每个项目的最大/最小值的平均长度为32,列名的平均长度为16,统计信息缓存总共占用了61.2777404785MiB的内存。强烈不建议分析具有非常大字符串值的列,因为这可能导致FE内存溢出。 | 100000 | -|enable_auto_sample|是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集| false| -|auto_analyze_job_record_count|控制统计信息的自动触发作业执行记录的持久化行数|20000| -|huge_table_default_sample_rows|定义开启开启大表自动sample后,对大表的采样行数|4194304| -|huge_table_lower_bound_size_in_bytes|定义大表的大小下界,在开启enable_auto_sample的情况下,大小超过该值的表将会自动通过采样收集统计信息|5368 709120| -|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次|43200000| -|table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|80| +### 3.1 会话变量 |会话变量|说明|默认值| |---|---|---| |full_auto_analyze_start_time|自动统计信息收集开始时间|00:00:00| 
-|full_auto_analyze_end_time|自动统计信息收集结束时间|02:00:00|
+|full_auto_analyze_end_time|自动统计信息收集结束时间|23:59:59|
 |enable_full_auto_analyze|开启自动收集功能|true|
+|huge_table_default_sample_rows|对大表的采样行数|4194304|
+|huge_table_lower_bound_size_in_bytes|大小超过该值的表,在自动收集时将会自动通过采样收集统计信息|5368709120|
+|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes * 5的表仅ANALYZE一次|43200000|
+|table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|60|
+|analyze_timeout|控制ANALYZE超时时间,单位为秒|43200|
 
-注意:上面列出的会话变量必须通过`SET GLOBAL`全局设置。
+
-## 使用建议 +### 3.2 FE配置项 -根据我们的测试,在数据量(这里指实际存储占用的空间)为128GiB以下的表上,除自动收集功能执行时间段之外无须改动默认配置。 +下面的FE配置项通常情况下,无需关注 -依据集群配置情况,自动收集任务通常会占用20%左右的CPU资源,因此用户需要根据自己的业务情况,适当调整自动收集功能执行时间段以避开业务高峰期资源抢占。 +|FE配置项|说明|默认值| +|---|---|---| +|analyze_record_limit|控制统计信息作业执行记录的持久化行数|20000| +|stats_cache_size| FE侧统计信息缓存条数 | 500000 | +| statistics_simultaneously_running_task_num |可同时执行的异步作业数量|3| +| statistics_sql_mem_limit_in_bytes| 控制每个统计信息SQL可占用的BE内存| 2L * 1024 * 1024 * 1024 (2GiB) | -由于ANALYZE是资源密集型操作,因此最好尽可能不要在业务高峰期执行此类操作,从而避免对业务造成干扰,集群负载较高的情况下,ANALYZE操作也更容易失败。此外,基于相同的原因,我们建议用户避免全量的ANALYZE整库整表。通常来讲,只需要对经常作为谓词条件,JOIN条件,聚合字段以及ID字段的列进行ANALYZE就足够了。如果用户提交的SQL涉及到大量此类操作,并且表上也没有统计信息或者统计信息非常陈旧,那么我们建议: +
-* 在提交复杂查询之前先对涉及到的表列进行ANALYZE,因为规划不当的复杂查询将占用非常多的系统资源,非荣容易资源耗尽或超时而失败 -* 如果用户为Doris配置了周期性数据导入例程,那么建议在导入完毕后,执行ANALYZE从而保证后续查询规划能够利用到最新的统计数据。可以利用Doris已有的作业调度框架自动化完成此类设置 -* 当表的数据发生显著变化后,比如新建表并完成数据导入后,ANALYZE对应的表。 +## 4. 常见问题 -## 常见问题 +--- -### ANALYZE WITH SYNC 执行失败:Failed to analyze following columns... +### 4.1 ANALYZE提交报错:Stats table not available... -SQL执行时间受`query_timeout`会话变量控制,该变量默认值为300秒,`ANALYZE DATABASE/TABLE`等语句通常耗时较大,很容易超过该时间限制而被cancel,建议根据ANALYZE对象的数据量适当增大`query_timeout`的值。 - -### ANALYZE提交报错:Stats table not available... - -执行ANALYZE时统计数据会被写入到内部表`__internal_schema.column_statistics`中,FE会在执行ANALYZE前检查该表tablet状态,如果存在不可用的tablet则拒绝执行任务。出现该报错请检查BE集群状态。 +执行ANALYZE时统计数据会被写入到内部表`__internal_schema.column_statistics`中,FE会在执行ANALYZE前检查该表tablet状态,如果存在不可用的tablet则拒绝执行作业。出现该报错请检查BE集群状态。 用户可通过`SHOW BACKENDS\G`,确定BE状态是否正常。如果BE状态正常,可使用命令`ADMIN SHOW REPLICA STATUS FROM __internal_schema.[tbl_in_this_db]`,检查该库下tablet状态,确保tablet状态正常。 -### 大表ANALYZE失败 +
+ +### 4.2 大表ANALYZE失败 由于ANALYZE能够使用的资源受到比较严格的限制,对一些大表的ANALYZE操作有可能超时或者超出BE内存限制。这些情况下,建议使用 `ANALYZE ... WITH SAMPLE...`。 diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 2d02a2632b..c6970c7d17 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -17,8 +17,6 @@ package org.apache.doris.common; -import java.util.concurrent.TimeUnit; - public class Config extends ConfigBase { @ConfField(description = {"用户自定义配置文件的路径,用于存放 fe_custom.conf。该文件中的配置会覆盖 fe.conf 中的配置", @@ -1761,7 +1759,7 @@ public class Config extends ConfigBase { * Used to determined how many statistics collection SQL could run simultaneously. */ @ConfField - public static int statistics_simultaneously_running_task_num = 5; + public static int statistics_simultaneously_running_task_num = 3; /** * if table has too many replicas, Fe occur oom when schema change. @@ -2068,7 +2066,7 @@ public class Config extends ConfigBase { * FE OOM. 
*/ @ConfField - public static long stats_cache_size = 10_0000; + public static long stats_cache_size = 50_0000; /** * This configuration is used to enable the statistics of query information, which will record @@ -2091,9 +2089,6 @@ public class Config extends ConfigBase { "Whether to enable binlog feature"}) public static boolean enable_feature_binlog = false; - @ConfField - public static int analyze_task_timeout_in_hours = 12; - @ConfField(mutable = true, masterOnly = true, description = { "是否禁止使用 WITH REOSOURCE 语句创建 Catalog。", "Whether to disable creating catalog with WITH RESOURCE statement."}) @@ -2159,9 +2154,6 @@ public class Config extends ConfigBase { @ConfField public static boolean forbid_running_alter_job = false; - @ConfField - public static int table_stats_health_threshold = 80; - @ConfField(description = { "暂时性配置项,开启后会自动将所有的olap表修改为可light schema change", "temporary config filed, will make all olap tables enable light schema change" @@ -2188,28 +2180,6 @@ public class Config extends ConfigBase { + "but it will increase the memory overhead."}) public static int virtual_node_number = 2048; - @ConfField(description = {"控制对大表的自动ANALYZE的最小时间间隔," - + "在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次", - "This controls the minimum time interval for automatic ANALYZE on large tables. Within this interval," - + "tables larger than huge_table_lower_bound_size_in_bytes are analyzed only once."}) - public static long huge_table_auto_analyze_interval_in_millis = TimeUnit.HOURS.toMillis(12); - - @ConfField(description = {"定义大表的大小下界,在开启enable_auto_sample的情况下," - + "大小超过该值的表将会自动通过采样收集统计信息", "This defines the lower size bound for large tables. 
" - + "When enable_auto_sample is enabled, tables larger than this value will automatically collect " - + "statistics through sampling"}) - public static long huge_table_lower_bound_size_in_bytes = 5L * 1024 * 1024 * 1024; - - @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样比例", - "This defines the number of sample percent for large tables when automatic sampling for" - + "large tables is enabled"}) - public static int huge_table_default_sample_rows = 4194304; - - @ConfField(description = {"是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集" - + "统计信息", "Whether to enable automatic sampling for large tables, which, when enabled, automatically" - + "collects statistics through sampling for tables larger than 'huge_table_lower_bound_size_in_bytes'"}) - public static boolean enable_auto_sample = false; - @ConfField(description = { "控制统计信息的自动触发作业执行记录的持久化行数", "Determine the persist number of automatic triggered analyze job execution status" diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 1cbdcc12a1..8c028bc8fc 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -230,7 +230,6 @@ import org.apache.doris.statistics.AnalysisManager; import org.apache.doris.statistics.StatisticsAutoCollector; import org.apache.doris.statistics.StatisticsCache; import org.apache.doris.statistics.StatisticsCleaner; -import org.apache.doris.statistics.StatisticsPeriodCollector; import org.apache.doris.statistics.query.QueryStats; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; @@ -495,8 +494,6 @@ public class Env { private StatisticsAutoCollector statisticsAutoCollector; - private StatisticsPeriodCollector statisticsPeriodCollector; - private HiveTransactionMgr hiveTransactionMgr; private TopicPublisherThread topicPublisherThread; @@ -720,7 +717,6 @@ public class 
Env { this.analysisManager = new AnalysisManager(); this.statisticsCleaner = new StatisticsCleaner(); this.statisticsAutoCollector = new StatisticsAutoCollector(); - this.statisticsPeriodCollector = new StatisticsPeriodCollector(); this.globalFunctionMgr = new GlobalFunctionMgr(); this.workloadGroupMgr = new WorkloadGroupMgr(); this.queryStats = new QueryStats(); @@ -971,9 +967,6 @@ public class Env { if (statisticsAutoCollector != null) { statisticsAutoCollector.start(); } - if (statisticsPeriodCollector != null) { - statisticsPeriodCollector.start(); - } queryCancelWorker.start(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 16cec127bf..d9501f0f1d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -54,7 +54,6 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.HistogramTask; -import org.apache.doris.statistics.MVAnalysisTask; import org.apache.doris.statistics.OlapAnalysisTask; import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.statistics.util.StatisticsUtil; @@ -1122,11 +1121,9 @@ public class OlapTable extends Table { public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) { if (info.analysisType.equals(AnalysisType.HISTOGRAM)) { return new HistogramTask(info); - } - if (info.analysisType.equals(AnalysisType.FUNDAMENTALS)) { + } else { return new OlapAnalysisTask(info); } - return new MVAnalysisTask(info); } public boolean needReAnalyzeTable(TableStatsMeta tblStats) { @@ -1146,7 +1143,7 @@ public class OlapTable extends Table { } long updateRows = tblStats.updatedRows.get(); int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < 
Config.table_stats_health_threshold; + return tblHealth < StatisticsUtil.getTableStatsHealthThreshold(); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 0fad8ea1d1..61e7538774 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -569,6 +569,10 @@ public class StatsCalculator extends DefaultPlanVisitor { } private ColumnStatistic getColumnStatistic(TableIf table, String colName) { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getSessionVariable().internalSession) { + return ColumnStatistic.UNKNOWN; + } long catalogId; long dbId; try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 1bf6791298..9cb2b5edc8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -62,6 +62,7 @@ import java.util.Locale; import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; /** * System variable. 
@@ -430,6 +431,17 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_DECIMAL256 = "enable_decimal256"; + public static final String STATS_INSERT_MERGE_ITEM_COUNT = "stats_insert_merge_item_count"; + + public static final String HUGE_TABLE_DEFAULT_SAMPLE_ROWS = "huge_table_default_sample_rows"; + public static final String HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES = "huge_table_lower_bound_size_in_bytes"; + + public static final String HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS + = "huge_table_auto_analyze_interval_in_millis"; + + public static final String TABLE_STATS_HEALTH_THRESHOLD + = "table_stats_health_threshold"; + public static final List DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -486,7 +498,7 @@ public class SessionVariable implements Serializable, Writable { public int queryTimeoutS = 900; // query timeout in second. - @VariableMgr.VarAttr(name = ANALYZE_TIMEOUT, needForward = true) + @VariableMgr.VarAttr(name = ANALYZE_TIMEOUT, flag = VariableMgr.GLOBAL, needForward = true) public int analyzeTimeoutS = 43200; // The global max_execution_time value provides the default for the session value for new connections. 
@@ -1246,7 +1258,7 @@ public class SessionVariable implements Serializable, Writable { description = {"该参数定义自动ANALYZE例程的结束时间", "This parameter defines the end time for the automatic ANALYZE routine."}, flag = VariableMgr.GLOBAL) - public String fullAutoAnalyzeEndTime = "02:00:00"; + public String fullAutoAnalyzeEndTime = "23:59:59"; @VariableMgr.VarAttr(name = SQL_DIALECT, needForward = true, checker = "checkSqlDialect", description = {"解析sql使用的方言", "The dialect used to parse sql."}) @@ -1276,6 +1288,48 @@ public class SessionVariable implements Serializable, Writable { "the runtime filter id in IGNORE_RUNTIME_FILTER_IDS list will not be generated"}) public String ignoreRuntimeFilterIds = ""; + + @VariableMgr.VarAttr(name = STATS_INSERT_MERGE_ITEM_COUNT, flag = VariableMgr.GLOBAL, description = { + "控制统计信息相关INSERT攒批数量", "Controls the batch size for stats INSERT merging." + } + ) + public int statsInsertMergeItemCount = 200; + + @VariableMgr.VarAttr(name = HUGE_TABLE_DEFAULT_SAMPLE_ROWS, flag = VariableMgr.GLOBAL, description = { + "定义开启开启大表自动sample后,对大表的采样比例", + "This defines the number of sample percent for large tables when automatic sampling for" + + "large tables is enabled" + + }) + public long hugeTableDefaultSampleRows = 4194304; + + + @VariableMgr.VarAttr(name = HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES, flag = VariableMgr.GLOBAL, + description = { + "大小超过该值的表将会自动通过采样收集统计信息", + "This defines the lower size bound for large tables. " + + "When enable_auto_sample is enabled, tables" + + "larger than this value will automatically collect " + + "statistics through sampling"}) + public long hugeTableLowerBoundSizeInBytes = 5L * 1024 * 1024 * 1024; + + @VariableMgr.VarAttr(name = HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS, flag = VariableMgr.GLOBAL, + description = {"控制对大表的自动ANALYZE的最小时间间隔," + + "在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次", + "This controls the minimum time interval for automatic ANALYZE on large tables." 
+ + "Within this interval," + + "tables larger than huge_table_lower_bound_size_in_bytes are analyzed only once."}) + public long hugeTableAutoAnalyzeIntervalInMillis = TimeUnit.HOURS.toMillis(12); + + @VariableMgr.VarAttr(name = TABLE_STATS_HEALTH_THRESHOLD, flag = VariableMgr.GLOBAL, + description = {"取值在0-100之间,当自上次统计信息收集操作之后" + + "数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时", + "The value should be between 0 and 100. When the data update quantity " + + "exceeds (100 - table_stats_health_threshold)% since the last " + + "statistics collection operation, the statistics for this table are" + + "considered outdated."}) + public int tableStatsHealthThreshold = 60; + public static final String IGNORE_RUNTIME_FILTER_IDS = "ignore_runtime_filter_ids"; public Set getIgnoredRuntimeFilterIds() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java new file mode 100644 index 0000000000..904dc21e33 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Env; +import org.apache.doris.qe.AuditLogHelper; +import org.apache.doris.qe.AutoCloseConnectContext; +import org.apache.doris.qe.QueryState; +import org.apache.doris.qe.QueryState.MysqlStateType; +import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.statistics.util.StatisticsUtil; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringJoiner; + +public class AnalysisJob { + + public static final Logger LOG = LogManager.getLogger(AnalysisJob.class); + + protected Set queryingTask; + + protected Set queryFinished; + + protected List buf; + + protected int totalTaskCount; + + protected int queryFinishedTaskCount; + + protected StmtExecutor stmtExecutor; + + protected boolean killed; + + protected long start; + + protected AnalysisInfo jobInfo; + + protected AnalysisManager analysisManager; + + public AnalysisJob(AnalysisInfo jobInfo, Collection queryingTask) { + for (BaseAnalysisTask task : queryingTask) { + task.job = this; + } + this.queryingTask = new HashSet<>(queryingTask); + this.queryFinished = new HashSet<>(); + this.buf = new ArrayList<>(); + totalTaskCount = queryingTask.size(); + start = System.currentTimeMillis(); + this.jobInfo = jobInfo; + this.analysisManager = Env.getCurrentEnv().getAnalysisManager(); + } + + public synchronized void appendBuf(BaseAnalysisTask task, List statsData) { + queryingTask.remove(task); + buf.addAll(statsData); + queryFinished.add(task); + queryFinishedTaskCount += 1; + if (queryFinishedTaskCount == totalTaskCount) { + writeBuf(); + updateTaskState(AnalysisState.FINISHED, "Cost time in sec: " + + (System.currentTimeMillis() - start) / 1000); + deregisterJob(); + } else if (buf.size() >= StatisticsUtil.getInsertMergeCount()) { + writeBuf(); + } + } + + 
// CHECKSTYLE OFF + // fallthrough here is expected + public void updateTaskState(AnalysisState state, String msg) { + long time = System.currentTimeMillis(); + switch (state) { + case FAILED: + for (BaseAnalysisTask task : queryingTask) { + analysisManager.updateTaskStatus(task.info, state, msg, time); + task.cancel(); + } + killed = true; + case FINISHED: + for (BaseAnalysisTask task : queryFinished) { + analysisManager.updateTaskStatus(task.info, state, msg, time); + } + default: + // DO NOTHING + } + } + + protected void writeBuf() { + if (killed) { + return; + } + // buf could be empty when nothing need to do, for example user submit an analysis task for table with no data + // change + if (!buf.isEmpty()) { + String insertStmt = "INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + " VALUES "; + StringJoiner values = new StringJoiner(","); + for (ColStatsData data : buf) { + values.add(data.toSQL(true)); + } + insertStmt += values.toString(); + int retryTimes = 0; + while (retryTimes < StatisticConstants.ANALYZE_TASK_RETRY_TIMES) { + if (killed) { + return; + } + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(false)) { + stmtExecutor = new StmtExecutor(r.connectContext, insertStmt); + executeWithExceptionOnFail(stmtExecutor); + break; + } catch (Exception t) { + LOG.warn("Failed to write buf: " + insertStmt, t); + retryTimes++; + if (retryTimes >= StatisticConstants.ANALYZE_TASK_RETRY_TIMES) { + updateTaskState(AnalysisState.FAILED, t.getMessage()); + return; + } + } + } + } + updateTaskState(AnalysisState.FINISHED, ""); + syncLoadStats(); + queryFinished.clear(); + } + + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + if (killed) { + return; + } + LOG.debug("execute internal sql: {}", stmtExecutor.getOriginStmt()); + try { + stmtExecutor.execute(); + QueryState queryState = stmtExecutor.getContext().getState(); + if (queryState.getStateType().equals(MysqlStateType.ERR)) { + throw 
new RuntimeException( + "Failed to insert : " + stmtExecutor.getOriginStmt().originStmt + "Error msg: " + + queryState.getErrorMessage()); + } + } finally { + AuditLogHelper.logAuditLog(stmtExecutor.getContext(), stmtExecutor.getOriginStmt().toString(), + stmtExecutor.getParsedStmt(), stmtExecutor.getQueryStatisticsForAuditLog(), + true); + } + } + + public void taskFailed(BaseAnalysisTask task, String reason) { + updateTaskState(AnalysisState.FAILED, reason); + cancel(); + deregisterJob(); + } + + public void cancel() { + for (BaseAnalysisTask task : queryingTask) { + task.cancel(); + } + } + + public void deregisterJob() { + analysisManager.removeJob(jobInfo.jobId); + } + + protected void syncLoadStats() { + long tblId = jobInfo.tblId; + for (BaseAnalysisTask task : queryFinished) { + String colName = task.col.getName(); + if (!Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tblId, -1, colName)) { + analysisManager.removeColStatsStatus(tblId, colName); + } + } + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index ef6cafa1f3..e4b1b01fae 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -42,7 +42,6 @@ import org.apache.doris.common.FeConstants; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; -import org.apache.doris.common.util.Daemon; import org.apache.doris.common.util.Util; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.mysql.privilege.PrivPredicate; @@ -101,7 +100,7 @@ import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; -public class AnalysisManager extends Daemon implements Writable { +public class AnalysisManager implements Writable 
{ private static final Logger LOG = LogManager.getLogger(AnalysisManager.class); @@ -113,11 +112,11 @@ public class AnalysisManager extends Daemon implements Writable { private AnalysisTaskExecutor taskExecutor; // Store task information in metadata. - private final NavigableMap analysisTaskInfoMap = + protected final NavigableMap analysisTaskInfoMap = Collections.synchronizedNavigableMap(new TreeMap<>()); // Store job information in metadata. - private final NavigableMap analysisJobInfoMap = + protected final NavigableMap analysisJobInfoMap = Collections.synchronizedNavigableMap(new TreeMap<>()); // Tracking system submitted job, keep in mem only @@ -128,6 +127,8 @@ public class AnalysisManager extends Daemon implements Writable { private final Map idToTblStats = new ConcurrentHashMap<>(); + private final Map idToAnalysisJob = new ConcurrentHashMap<>(); + protected SimpleQueue autoJobs = createSimpleQueue(null, this); private final Function userJobStatusUpdater = w -> { @@ -237,7 +238,6 @@ public class AnalysisManager extends Daemon implements Writable { new Function[] {userJobStatusUpdater, systemJobStatusUpdater}; public AnalysisManager() { - super(TimeUnit.SECONDS.toMillis(StatisticConstants.ANALYZE_MANAGER_INTERVAL_IN_SECS)); if (!Env.isCheckpointThread()) { this.taskExecutor = new AnalysisTaskExecutor(Config.statistics_simultaneously_running_task_num); this.statisticsCache = new StatisticsCache(); @@ -245,44 +245,6 @@ public class AnalysisManager extends Daemon implements Writable { } } - @Override - protected void runOneCycle() { - clear(); - } - - private void clear() { - clearExpiredAnalysisInfo(analysisJobInfoMap, (a) -> - a.scheduleType.equals(ScheduleType.ONCE) - && System.currentTimeMillis() - a.lastExecTimeInMs - > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), - (id) -> { - Env.getCurrentEnv().getEditLog().logDeleteAnalysisJob(new AnalyzeDeletionLog(id)); - return null; - }); - 
clearExpiredAnalysisInfo(analysisTaskInfoMap, (a) -> System.currentTimeMillis() - a.lastExecTimeInMs - > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), - (id) -> { - Env.getCurrentEnv().getEditLog().logDeleteAnalysisTask(new AnalyzeDeletionLog(id)); - return null; - }); - } - - private void clearExpiredAnalysisInfo(Map infoMap, Predicate isExpired, - Function writeLog) { - synchronized (infoMap) { - List expired = new ArrayList<>(); - for (Entry entry : infoMap.entrySet()) { - if (isExpired.test(entry.getValue())) { - expired.add(entry.getKey()); - } - } - for (Long k : expired) { - infoMap.remove(k); - writeLog.apply(k); - } - } - } - public StatisticsCache getStatisticsCache() { return statisticsCache; } @@ -371,6 +333,7 @@ public class AnalysisManager extends Daemon implements Writable { boolean isSync = stmt.isSync(); Map analysisTaskInfos = new HashMap<>(); createTaskForEachColumns(jobInfo, analysisTaskInfos, isSync); + constructJob(jobInfo, analysisTaskInfos.values()); if (!jobInfo.partitionOnly && stmt.isAllColumns() && StatisticsUtil.isExternalTable(jobInfo.catalogId, jobInfo.dbId, jobInfo.tblId)) { createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); @@ -446,7 +409,6 @@ public class AnalysisManager extends Daemon implements Writable { */ private Map> validateAndGetPartitions(TableIf table, Set columnNames, Set partitionNames, AnalysisType analysisType) throws DdlException { - long tableId = table.getId(); Map> columnToPartitions = columnNames.stream() .collect(Collectors.toMap( @@ -467,27 +429,6 @@ public class AnalysisManager extends Daemon implements Writable { return columnToPartitions; } - // Get the partition granularity statistics that have been collected - Map> existColAndPartsForStats = StatisticsRepository - .fetchColAndPartsForStats(tableId); - - if (existColAndPartsForStats.isEmpty()) { - // There is no historical statistical information, no need to do validation - return 
columnToPartitions; - } - - Set existPartIdsForStats = new HashSet<>(); - existColAndPartsForStats.values().forEach(existPartIdsForStats::addAll); - Set idToPartition = StatisticsUtil.getPartitionIds(table); - // Get an invalid set of partitions (those partitions were deleted) - Set invalidPartIds = existPartIdsForStats.stream() - .filter(id -> !idToPartition.contains(id)).collect(Collectors.toSet()); - - if (!invalidPartIds.isEmpty()) { - // Delete invalid partition statistics to avoid affecting table statistics - StatisticsRepository.dropStatistics(invalidPartIds); - } - if (analysisType == AnalysisType.FUNDAMENTALS) { Map> result = table.findReAnalyzeNeededPartitions(); result.keySet().retainAll(columnNames); @@ -721,11 +662,12 @@ public class AnalysisManager extends Daemon implements Writable { public void syncExecute(Collection tasks) { SyncTaskCollection syncTaskCollection = new SyncTaskCollection(tasks); ConnectContext ctx = ConnectContext.get(); + ThreadPoolExecutor syncExecPool = createThreadPoolForSyncAnalyze(); try { ctxToSyncTask.put(ctx, syncTaskCollection); - ThreadPoolExecutor syncExecPool = createThreadPoolForSyncAnalyze(); syncTaskCollection.execute(syncExecPool); } finally { + syncExecPool.shutdown(); ctxToSyncTask.remove(ctx); } } @@ -738,7 +680,7 @@ public class AnalysisManager extends Daemon implements Writable { new SynchronousQueue(), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("SYNC ANALYZE" + "-%d") .build(), new BlockedPolicy(poolName, - (int) TimeUnit.HOURS.toSeconds(Config.analyze_task_timeout_in_hours))); + StatisticsUtil.getAnalyzeTimeout())); } public void dropStats(DropStatsStmt dropStatsStmt) throws DdlException { @@ -760,6 +702,7 @@ public class AnalysisManager extends Daemon implements Writable { for (String col : cols) { Env.getCurrentEnv().getStatisticsCache().invalidate(tblId, -1L, col); } + tableStats.updatedTime = 0; } logCreateTableStats(tableStats); StatisticsRepository.dropStatistics(tblId, cols); @@ -1129,4 
+1072,17 @@ public class AnalysisManager extends Daemon implements Writable { } return tableStats.findColumnStatsMeta(colName); } + + public AnalysisJob findJob(long id) { + return idToAnalysisJob.get(id); + } + + public void constructJob(AnalysisInfo jobInfo, Collection tasks) { + AnalysisJob job = new AnalysisJob(jobInfo, tasks); + idToAnalysisJob.put(jobInfo.jobId, job); + } + + public void removeJob(long id) { + idToAnalysisJob.remove(id); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java index 4b133ce0eb..58bae9fe66 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java @@ -18,9 +18,9 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Env; -import org.apache.doris.common.Config; import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -36,7 +36,7 @@ public class AnalysisTaskExecutor extends Thread { private static final Logger LOG = LogManager.getLogger(AnalysisTaskExecutor.class); - private final ThreadPoolExecutor executors; + protected final ThreadPoolExecutor executors; private final BlockingQueue taskQueue = new PriorityBlockingQueue(20, @@ -72,18 +72,22 @@ public class AnalysisTaskExecutor extends Thread { private void doCancelExpiredJob() { for (;;) { + tryToCancel(); + } + } + + protected void tryToCancel() { + try { + AnalysisTaskWrapper taskWrapper = taskQueue.take(); try { - AnalysisTaskWrapper taskWrapper = taskQueue.take(); - try { - long timeout = TimeUnit.HOURS.toMillis(Config.analyze_task_timeout_in_hours) - - (System.currentTimeMillis() - taskWrapper.getStartTime()); - 
taskWrapper.get(timeout < 0 ? 0 : timeout, TimeUnit.MILLISECONDS); - } catch (Exception e) { - taskWrapper.cancel(e.getMessage()); - } - } catch (Throwable throwable) { - LOG.warn(throwable); + long timeout = TimeUnit.SECONDS.toMillis(StatisticsUtil.getAnalyzeTimeout()) + - (System.currentTimeMillis() - taskWrapper.getStartTime()); + taskWrapper.get(timeout < 0 ? 0 : timeout, TimeUnit.MILLISECONDS); + } catch (Exception e) { + taskWrapper.cancel(e.getMessage()); } + } catch (Throwable throwable) { + LOG.warn("cancel analysis task failed", throwable); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java index 9aa3d85992..ffdd375ee9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java @@ -17,7 +17,6 @@ package org.apache.doris.statistics; -import org.apache.doris.catalog.Env; import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.Util; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; @@ -59,9 +58,8 @@ public class AnalysisTaskWrapper extends FutureTask { if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.inAnalyzeTime( LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { // TODO: Do we need a separate AnalysisState here? 
- Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(task.info, AnalysisState.FAILED, "Auto task" - + "doesn't get executed within specified time range", System.currentTimeMillis()); + task.job.taskFailed(task, "Auto task" + + "doesn't get executed within specified time range"); return; } executor.putJob(this); @@ -76,15 +74,7 @@ public class AnalysisTaskWrapper extends FutureTask { if (!task.killed) { if (except != null) { LOG.warn("Analyze {} failed.", task.toString(), except); - Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(task.info, - AnalysisState.FAILED, Util.getRootCauseMessage(except), System.currentTimeMillis()); - } else { - LOG.debug("Analyze {} finished, cost time:{}", task.toString(), - System.currentTimeMillis() - startTime); - Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(task.info, - AnalysisState.FINISHED, "", System.currentTimeMillis()); + task.job.taskFailed(task, Util.getRootCauseMessage(except)); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 4f7d588de7..3fcebd6c38 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -22,14 +22,12 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.Config; import org.apache.doris.datasource.CatalogIf; -import org.apache.doris.qe.AuditLogHelper; -import org.apache.doris.qe.QueryState; -import org.apache.doris.qe.QueryState.MysqlStateType; +import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; +import 
org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.util.DBObjects; import org.apache.doris.statistics.util.StatisticsUtil; @@ -38,6 +36,7 @@ import com.google.common.base.Preconditions; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.util.Collections; import java.util.concurrent.TimeUnit; public abstract class BaseAnalysisTask { @@ -52,59 +51,25 @@ public abstract class BaseAnalysisTask { + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " ; - /** - * Stats stored in the column_statistics table basically has two types, `part_id` is null which means it is - * aggregate from partition level stats, `part_id` is not null which means it is partition level stats. - * For latter, it's id field contains part id, for previous doesn't. - */ - protected static final String INSERT_PART_STATISTICS = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " - + "${catalogId} AS catalog_id, " - + "${dbId} AS db_id, " - + "${tblId} AS tbl_id, " - + "${idxId} AS idx_id, " - + "'${colId}' AS col_id, " - + "${partId} AS part_id, " - + "COUNT(1) AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " - + "${dataSizeFunction} AS data_size, " - + "NOW() "; - - protected static final String INSERT_COL_STATISTICS = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, " - + " ndv, null_count," - + " to_base64(CAST(min AS string)), to_base64(CAST(max AS string)), data_size, update_time\n" - + " FROM \n" - + " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + protected static final String COLLECT_COL_STATISTICS = + "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + " 
${catalogId} AS catalog_id, " + " ${dbId} AS db_id, " + " ${tblId} AS tbl_id, " + " ${idxId} AS idx_id, " + " '${colId}' AS col_id, " + " NULL AS part_id, " - + " SUM(count) AS row_count, \n" - + " SUM(null_count) AS null_count, " - + " MIN(CAST(from_base64(min) AS ${type})) AS min, " - + " MAX(CAST(from_base64(max) AS ${type})) AS max, " - + " SUM(data_size_in_bytes) AS data_size, " - + " NOW() AS update_time \n" - + " FROM ${internalDB}.${columnStatTbl}" - + " WHERE ${internalDB}.${columnStatTbl}.db_id = '${dbId}' AND " - + " ${internalDB}.${columnStatTbl}.tbl_id='${tblId}' AND " - + " ${internalDB}.${columnStatTbl}.col_id='${colId}' AND " - + " ${internalDB}.${columnStatTbl}.idx_id='${idxId}' AND " - + " ${internalDB}.${columnStatTbl}.part_id IS NOT NULL" - + " ) t1, \n"; + + " COUNT(1) AS row_count, " + + " NDV(`${colName}`) AS ndv, " + + " COUNT(1) - COUNT(${colName}) AS null_count, " + + " CAST(MIN(${colName}) AS STRING) AS min, " + + " CAST(MAX(${colName}) AS STRING) AS max, " + + " ${dataSizeFunction} AS data_size, " + + " NOW() AS update_time " + + " FROM `${dbName}`.`${tblName}`"; - protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " + protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = + " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " @@ -115,8 +80,8 @@ public abstract class BaseAnalysisTask { + "${row_count} AS row_count, " + "${ndv} AS ndv, " + "${null_count} AS null_count, " - + "to_base64('${min}') AS min, " - + "to_base64('${max}') AS max, " + + "'${min}' AS min, " + + "'${max}' AS max, " + "${data_size} AS data_size, " + "NOW() "; @@ -136,6 +101,8 @@ public abstract class BaseAnalysisTask { protected TableSample tableSample = null; + protected AnalysisJob job; + @VisibleForTesting public BaseAnalysisTask() { @@ -192,6 +159,7 @@ public abstract class BaseAnalysisTask { } 
LOG.warn("Failed to execute analysis task, retried times: {}", retriedTimes++, t); if (retriedTimes > StatisticConstants.ANALYZE_TASK_RETRY_TIMES) { + job.taskFailed(this, t.getMessage()); throw new RuntimeException(t); } StatisticsUtil.sleep(TimeUnit.SECONDS.toMillis(2 ^ retriedTimes) * 10); @@ -266,11 +234,10 @@ public abstract class BaseAnalysisTask { return new TableSample(true, (long) info.samplePercent); } else if (info.sampleRows > 0) { return new TableSample(false, info.sampleRows); - } else if (info.analysisMethod == AnalysisMethod.FULL - && Config.enable_auto_sample - && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { + } else if (info.jobType.equals(JobType.SYSTEM) && info.analysisMethod == AnalysisMethod.FULL + && tbl.getDataSize(true) > StatisticsUtil.getHugeTableLowerBoundSizeInBytes()) { // If user doesn't specify sample percent/rows, use auto sample and update sample rows in analysis info. - return new TableSample(false, (long) Config.huge_table_default_sample_rows); + return new TableSample(false, StatisticsUtil.getHugeTableSampleRows()); } else { return null; } @@ -283,23 +250,20 @@ public abstract class BaseAnalysisTask { col == null ? 
"TableRowCount" : col.getName()); } - protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { - if (killed) { - return; - } - LOG.debug("execute internal sql: {}", stmtExecutor.getOriginStmt()); - try { - stmtExecutor.execute(); - QueryState queryState = stmtExecutor.getContext().getState(); - if (queryState.getStateType().equals(MysqlStateType.ERR)) { - throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", - catalog.getName(), db.getFullName(), info.colName, stmtExecutor.getOriginStmt().toString(), - queryState.getErrorMessage())); - } + public void setJob(AnalysisJob job) { + this.job = job; + } + + protected void runQuery(String sql) { + long startTime = System.currentTimeMillis(); + try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) { + stmtExecutor = new StmtExecutor(a.connectContext, sql); + stmtExecutor.executeInternalQuery(); + ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); + job.appendBuf(this, Collections.singletonList(colStatsData)); } finally { - AuditLogHelper.logAuditLog(stmtExecutor.getContext(), stmtExecutor.getOriginStmt().toString(), - stmtExecutor.getParsedStmt(), stmtExecutor.getQueryStatisticsForAuditLog(), - true); + LOG.debug("End cost time in secs: " + (System.currentTimeMillis() - startTime) / 1000); } } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index 6c94326a94..41936232af 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -19,6 +19,8 @@ package org.apache.doris.statistics; import org.apache.doris.statistics.util.StatisticsUtil; +import com.google.common.annotations.VisibleForTesting; + import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.StringJoiner; 
@@ -54,6 +56,18 @@ public class ColStatsData { public final String updateTime; + @VisibleForTesting + public ColStatsData() { + statsId = new StatsId(); + count = 0; + ndv = 0; + nullCount = 0; + minLit = null; + maxLit = null; + dataSizeInBytes = 0; + updateTime = null; + } + public ColStatsData(ResultRow row) { this.statsId = new StatsId(row); this.count = (long) Double.parseDouble(row.get(7)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 4583237f8c..049e80d52f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -23,26 +23,19 @@ import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.FeConstants; import org.apache.doris.datasource.hive.HiveMetaStoreCache; import org.apache.doris.external.hive.util.HiveUtil; -import org.apache.doris.qe.AutoCloseConnectContext; -import org.apache.doris.qe.QueryState; -import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.util.StatisticsUtil; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.util.ArrayList; import java.util.Collections; -import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; -import java.util.StringJoiner; import java.util.stream.Collectors; public class HMSAnalysisTask extends BaseAnalysisTask { @@ -51,9 +44,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { // While doing sample analysis, the sampled ndv result will multiply a factor (total size/sample size) // if ndv(col)/count(col) is greater than this threshold. 
- private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " + private static final String ANALYZE_TABLE_TEMPLATE = " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " @@ -70,28 +61,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; - private static final String ANALYZE_PARTITION_TEMPLATE = " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " - + "${catalogId} AS catalog_id, " - + "${dbId} AS db_id, " - + "${tblId} AS tbl_id, " - + "${idxId} AS idx_id, " - + "'${colId}' AS col_id, " - + "${partId} AS part_id, " - + "COUNT(1) AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "to_base64(MIN(`${colName}`)) AS min, " - + "to_base64(MAX(`${colName}`)) AS max, " - + "${dataSizeFunction} AS data_size, " - + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; - // cache stats for each partition, it would be inserted into column_statistics in a batch. 
- private final List> buf = new ArrayList<>(); - private final boolean isTableLevelTask; private final boolean isPartitionOnly; private Set partitionNames; @@ -131,25 +103,16 @@ public class HMSAnalysisTask extends BaseAnalysisTask { * Get column statistics and insert the result to __internal_schema.column_statistics */ private void getTableColumnStats() throws Exception { - if (isPartitionOnly) { - getPartitionNames(); - List partitionAnalysisSQLs = new ArrayList<>(); - for (String partId : this.partitionNames) { - partitionAnalysisSQLs.add(generateSqlForPartition(partId)); - } - execSQLs(partitionAnalysisSQLs); - } else { - if (!info.usingSqlForPartitionColumn && isPartitionColumn()) { - try { - getPartitionColumnStats(); - } catch (Exception e) { - LOG.warn("Failed to collect stats for partition col {} using metadata, " - + "fallback to normal collection", col.getName(), e); - getOrdinaryColumnStats(); - } - } else { + if (!info.usingSqlForPartitionColumn && isPartitionColumn()) { + try { + getPartitionColumnStats(); + } catch (Exception e) { + LOG.warn("Failed to collect stats for partition col {} using metadata, " + + "fallback to normal collection", col.getName(), e); getOrdinaryColumnStats(); } + } else { + getOrdinaryColumnStats(); } } @@ -182,7 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { params.put("maxFunction", getMaxFunction()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); + runQuery(sql); } private void getPartitionColumnStats() throws Exception { @@ -227,7 +190,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { params.put("data_size", String.valueOf(dataSize)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE); - executeInsertSql(sql); + runQuery(sql); } private String updateMinValue(String currentMin, String value) { @@ 
-278,7 +241,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { partitionNames = table.getPartitionNames(); } else if (info.partitionCount > 0) { partitionNames = table.getPartitionNames().stream() - .limit(info.partitionCount).collect(Collectors.toSet()); + .limit(info.partitionCount).collect(Collectors.toSet()); } if (partitionNames == null || partitionNames.isEmpty()) { throw new RuntimeException("Not a partition table or no partition specified."); @@ -286,80 +249,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask { } } - private String generateSqlForPartition(String partId) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_PARTITION_TEMPLATE); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String[] kv = splits[i].split("="); - sb.append(kv[0]); - sb.append("='"); - sb.append(kv[1]); - sb.append("'"); - if (i < splits.length - 1) { - sb.append(" and "); - } - } - Map params = buildStatsParams(partId); - params.put("dataSizeFunction", getDataSizeFunction(col)); - return new StringSubstitutor(params).replace(sb.toString()); - } - - public void execSQLs(List partitionAnalysisSQLs) throws Exception { - long startTime = System.currentTimeMillis(); - LOG.debug("analyze task {} start at {}", info.toString(), new Date()); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { - List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); - for (List group : sqlGroups) { - if (killed) { - return; - } - StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); - group.forEach(partitionCollectSQL::add); - stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); - buf.add(stmtExecutor.executeInternalQuery() - .stream().map(ColStatsData::new).collect(Collectors.toList())); - QueryState queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { - throw new 
RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", - catalog.getName(), db.getFullName(), info.colName, partitionCollectSQL, - queryState.getErrorMessage())); - } - } - for (List colStatsDataList : buf) { - StringBuilder batchInsertSQL = - new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME - + " VALUES "); - StringJoiner sj = new StringJoiner(","); - colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); - batchInsertSQL.append(sj); - stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); - executeWithExceptionOnFail(stmtExecutor); - } - } finally { - LOG.debug("analyze task {} end. cost {}ms", info, System.currentTimeMillis() - startTime); - } - - } - - private void executeInsertSql(String sql) throws Exception { - long startTime = System.currentTimeMillis(); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { - r.connectContext.getSessionVariable().disableNereidsPlannerOnce(); - this.stmtExecutor = new StmtExecutor(r.connectContext, sql); - r.connectContext.setExecutor(stmtExecutor); - this.stmtExecutor.execute(); - QueryState queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { - LOG.warn(String.format("Failed to analyze %s.%s.%s, sql: [%s], error: [%s]", - catalog.getName(), db.getFullName(), info.colName, sql, queryState.getErrorMessage())); - throw new RuntimeException(queryState.getErrorMessage()); - } - LOG.debug(String.format("Analyze %s.%s.%s done. SQL: [%s]. 
Cost %d ms.", - catalog.getName(), db.getFullName(), info.colName, sql, (System.currentTimeMillis() - startTime))); - } - } - private Map buildStatsParams(String partId) { Map commonParams = new HashMap<>(); String id = StatisticsUtil.constructId(tbl.getId(), -1); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index 5ae66d292d..649b075c67 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -20,25 +20,17 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.external.JdbcExternalTable; import org.apache.doris.common.FeConstants; -import org.apache.doris.qe.AutoCloseConnectContext; -import org.apache.doris.qe.QueryState; -import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.commons.text.StringSubstitutor; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import java.util.HashMap; import java.util.List; import java.util.Map; public class JdbcAnalysisTask extends BaseAnalysisTask { - private static final Logger LOG = LogManager.getLogger(JdbcAnalysisTask.class); - private static final String ANALYZE_SQL_TABLE_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " + private static final String ANALYZE_SQL_TABLE_TEMPLATE = " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " @@ -49,8 +41,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask { + "COUNT(1) AS row_count, " + "NDV(`${colName}`) AS ndv, " + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "to_base64(MIN(`${colName}`)) AS min, " - + "to_base64(MAX(`${colName}`)) AS max, " + + "MIN(`${colName}`) AS 
min, " + + "MAX(`${colName}`) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; @@ -117,25 +109,7 @@ public class JdbcAnalysisTask extends BaseAnalysisTask { params.put("dataSizeFunction", getDataSizeFunction(col)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); - } - - private void executeInsertSql(String sql) throws Exception { - long startTime = System.currentTimeMillis(); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { - r.connectContext.getSessionVariable().disableNereidsPlannerOnce(); - this.stmtExecutor = new StmtExecutor(r.connectContext, sql); - r.connectContext.setExecutor(stmtExecutor); - this.stmtExecutor.execute(); - QueryState queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { - LOG.warn(String.format("Failed to analyze %s.%s.%s, sql: [%s], error: [%s]", - catalog.getName(), db.getFullName(), info.colName, sql, queryState.getErrorMessage())); - throw new RuntimeException(queryState.getErrorMessage()); - } - LOG.debug(String.format("Analyze %s.%s.%s done. SQL: [%s]. Cost %d ms.", - catalog.getName(), db.getFullName(), info.colName, sql, (System.currentTimeMillis() - startTime))); - } + runQuery(sql); } private Map buildTableStatsParams(String partId) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java deleted file mode 100644 index 6a43c5092f..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/MVAnalysisTask.java +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.analysis.CreateMaterializedViewStmt; -import org.apache.doris.analysis.Expr; -import org.apache.doris.analysis.FunctionCallExpr; -import org.apache.doris.analysis.PartitionNames; -import org.apache.doris.analysis.SelectListItem; -import org.apache.doris.analysis.SelectStmt; -import org.apache.doris.analysis.SlotRef; -import org.apache.doris.analysis.SqlParser; -import org.apache.doris.analysis.SqlScanner; -import org.apache.doris.analysis.TableRef; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.MaterializedIndexMeta; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.common.FeConstants; -import org.apache.doris.common.util.SqlParserUtils; -import org.apache.doris.statistics.util.StatisticsUtil; - -import com.google.common.base.Preconditions; - -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * Analysis for the materialized view, only gets constructed when the AnalyzeStmt is not set which - * columns to be analyzed. 
- * TODO: Supports multi-table mv - */ -public class MVAnalysisTask extends BaseAnalysisTask { - - private static final String ANALYZE_MV_PART = INSERT_PART_STATISTICS - + " FROM (${sql}) mv ${sampleExpr}"; - - private static final String ANALYZE_MV_COL = INSERT_COL_STATISTICS - + " (SELECT NDV(`${colName}`) AS ndv " - + " FROM (${sql}) mv) t2"; - - private MaterializedIndexMeta meta; - - private SelectStmt selectStmt; - - private OlapTable olapTable; - - public MVAnalysisTask(AnalysisInfo info) { - super(info); - init(); - } - - private void init() { - olapTable = (OlapTable) tbl; - meta = olapTable.getIndexMetaByIndexId(info.indexId); - Preconditions.checkState(meta != null); - String mvDef = meta.getDefineStmt().originStmt; - SqlScanner input = - new SqlScanner(new StringReader(mvDef), 0L); - SqlParser parser = new SqlParser(input); - CreateMaterializedViewStmt cmv = null; - try { - cmv = (CreateMaterializedViewStmt) SqlParserUtils.getStmt(parser, 0); - } catch (Exception e) { - throw new RuntimeException(e); - } - selectStmt = cmv.getSelectStmt(); - selectStmt.getTableRefs().get(0).getName().setDb(db.getFullName()); - } - - @Override - public void doExecute() throws Exception { - for (Column column : meta.getSchema()) { - SelectStmt selectOne = (SelectStmt) selectStmt.clone(); - TableRef tableRef = selectOne.getTableRefs().get(0); - SelectListItem selectItem = selectOne.getSelectList().getItems() - .stream() - .filter(i -> isCorrespondingToColumn(i, column)) - .findFirst() - .get(); - selectItem.setAlias(column.getName()); - Map params = new HashMap<>(); - for (String partName : tbl.getPartitionNames()) { - PartitionNames partitionName = new PartitionNames(false, - Collections.singletonList(partName)); - tableRef.setPartitionNames(partitionName); - String sql = selectOne.toSql(); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("catalogId", 
String.valueOf(catalog.getId())); - params.put("dbId", String.valueOf(db.getId())); - params.put("tblId", String.valueOf(tbl.getId())); - params.put("idxId", String.valueOf(meta.getIndexId())); - String colName = column.getName(); - params.put("colId", colName); - String partId = olapTable.getPartition(partName) == null ? "NULL" : - String.valueOf(olapTable.getPartition(partName).getId()); - params.put("partId", partId); - params.put("dataSizeFunction", getDataSizeFunction(column)); - params.put("dbName", db.getFullName()); - params.put("colName", colName); - params.put("tblName", tbl.getName()); - params.put("sql", sql); - StatisticsUtil.execUpdate(ANALYZE_MV_PART, params); - } - params.remove("partId"); - params.remove("sampleExpr"); - params.put("type", column.getType().toString()); - StatisticsUtil.execUpdate(ANALYZE_MV_COL, params); - Env.getCurrentEnv().getStatisticsCache() - .refreshColStatsSync(meta.getIndexId(), meta.getIndexId(), column.getName()); - } - } - - // Based on the fact that materialized view create statement's select expr only contains basic SlotRef and - // AggregateFunction. 
- private boolean isCorrespondingToColumn(SelectListItem item, Column column) { - Expr expr = item.getExpr(); - if (expr instanceof SlotRef) { - SlotRef slotRef = (SlotRef) expr; - return slotRef.getColumnName().equalsIgnoreCase(column.getName()); - } - if (expr instanceof FunctionCallExpr) { - FunctionCallExpr func = (FunctionCallExpr) expr; - SlotRef slotRef = (SlotRef) func.getChild(0); - return slotRef.getColumnName().equalsIgnoreCase(column.getName()); - } - return false; - } - - @Override - protected void afterExecution() { - // DO NOTHING - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 185a582cde..b0c4b0b6c0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -22,28 +22,21 @@ import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; -import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.qe.AutoCloseConnectContext; -import org.apache.doris.qe.QueryState; -import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; import java.security.SecureRandom; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; -import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.StringJoiner; import java.util.stream.Collectors; /** @@ -51,29 +44,6 @@ import java.util.stream.Collectors; */ public class 
OlapAnalysisTask extends BaseAnalysisTask { - // TODO Currently, NDV is computed for the full table; in fact, - // NDV should only be computed for the relevant partition. - private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS - + " (SELECT NDV(`${colName}`) AS ndv " - + " FROM `${dbName}`.`${tblName}`) t2"; - - private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE = - " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " - + "${catalogId} AS catalog_id, " - + "${dbId} AS db_id, " - + "${tblId} AS tbl_id, " - + "${idxId} AS idx_id, " - + "'${colId}' AS col_id, " - + "${partId} AS part_id, " - + "COUNT(1) AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " - + "${dataSizeFunction} AS data_size, " - + "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName}"; - private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + "${catalogId} AS catalog_id, " @@ -92,9 +62,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask { + "FROM `${dbName}`.`${tblName}`" + "${tablets}"; - // cache stats for each partition, it would be inserted into column_statistics in a batch. 
- private final List> buf = new ArrayList<>(); - @VisibleForTesting public OlapAnalysisTask() { } @@ -148,45 +115,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask { stmtExecutor = new StmtExecutor(r.connectContext, stringSubstitutor.replace(SAMPLE_COLUMN_SQL_TEMPLATE)); // Scalar query only return one row ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); - OlapTable olapTable = (OlapTable) tbl; - Collection partitions = olapTable.getPartitions(); - int partitionCount = partitions.size(); - List values = partitions.stream().map(p -> String.format( - "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())", - StatisticsUtil.quote(StatisticsUtil.constructId(tbl.getId(), -1, col.getName(), p.getId())), - InternalCatalog.INTERNAL_CATALOG_ID, - db.getId(), - tbl.getId(), - -1, - StatisticsUtil.quote(col.getName()), - p.getId(), - colStatsData.count / partitionCount, - colStatsData.ndv / partitionCount, - colStatsData.nullCount / partitionCount, - StatisticsUtil.quote(colStatsData.minLit), - StatisticsUtil.quote(colStatsData.maxLit), - colStatsData.dataSizeInBytes / partitionCount)).collect(Collectors.toList()); - values.add(String.format( - "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())", - StatisticsUtil.quote(StatisticsUtil.constructId(tbl.getId(), -1, col.getName())), - InternalCatalog.INTERNAL_CATALOG_ID, - db.getId(), - tbl.getId(), - -1, - StatisticsUtil.quote(col.getName()), - "NULL", - colStatsData.count, - colStatsData.ndv, - colStatsData.nullCount, - StatisticsUtil.quote(colStatsData.minLit), - StatisticsUtil.quote(colStatsData.maxLit), - colStatsData.dataSizeInBytes)); - String insertSQL = "INSERT INTO " - + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME - + " VALUES " - + String.join(",", values); - stmtExecutor = new StmtExecutor(r.connectContext, insertSQL); - executeWithExceptionOnFail(stmtExecutor); + job.appendBuf(this, Collections.singletonList(colStatsData)); } } @@ -198,6 +127,7 
@@ public class OlapAnalysisTask extends BaseAnalysisTask { protected void doFull() throws Exception { Set partitionNames = info.colToPartitions.get(info.colName); if (partitionNames.isEmpty()) { + job.appendBuf(this, Collections.emptyList()); return; } Map params = new HashMap<>(); @@ -212,68 +142,14 @@ public class OlapAnalysisTask extends BaseAnalysisTask { params.put("dbName", db.getFullName()); params.put("colName", String.valueOf(info.colName)); params.put("tblName", String.valueOf(tbl.getName())); - List partitionAnalysisSQLs = new ArrayList<>(); - try { - tbl.readLock(); - - for (String partitionName : partitionNames) { - Partition part = tbl.getPartition(partitionName); - if (part == null) { - continue; - } - params.put("partId", String.valueOf(tbl.getPartition(partitionName).getId())); - // Avoid error when get the default partition - params.put("partitionName", "`" + partitionName + "`"); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - partitionAnalysisSQLs.add(stringSubstitutor.replace(COLLECT_PARTITION_STATS_SQL_TEMPLATE)); - } - } finally { - tbl.readUnlock(); - } - execSQLs(partitionAnalysisSQLs, params); + execSQL(params); } @VisibleForTesting - public void execSQLs(List partitionAnalysisSQLs, Map params) throws Exception { - long startTime = System.currentTimeMillis(); - LOG.debug("analyze task {} start at {}", info.toString(), new Date()); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) { - List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); - for (List group : sqlGroups) { - if (killed) { - return; - } - StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); - group.forEach(partitionCollectSQL::add); - stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); - buf.add(stmtExecutor.executeInternalQuery() - .stream().map(ColStatsData::new).collect(Collectors.toList())); - QueryState 
queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(MysqlStateType.ERR)) { - throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", - catalog.getName(), db.getFullName(), info.colName, partitionCollectSQL, - queryState.getErrorMessage())); - } - } - for (List colStatsDataList : buf) { - StringBuilder batchInsertSQL = - new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME - + " VALUES "); - StringJoiner sj = new StringJoiner(","); - colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); - batchInsertSQL.append(sj.toString()); - stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); - executeWithExceptionOnFail(stmtExecutor); - } - params.put("type", col.getType().toString()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE); - stmtExecutor = new StmtExecutor(r.connectContext, sql); - executeWithExceptionOnFail(stmtExecutor); - } finally { - LOG.debug("analyze task {} end. cost {}ms", info, - System.currentTimeMillis() - startTime); - } + public void execSQL(Map params) throws Exception { + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String collectColStats = stringSubstitutor.replace(COLLECT_COL_STATISTICS); + runQuery(collectColStats); } // Get sample tablets id and scale up scaleFactor diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index e6b8297d0c..f008c8fe30 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -78,12 +78,20 @@ public class StatisticConstants { public static final int LOAD_RETRY_TIMES = 3; - // union more relation than 512 may cause StackOverFlowException in the future. 
- public static final int UNION_ALL_LIMIT = 512; - public static final String FULL_AUTO_ANALYZE_START_TIME = "00:00:00"; public static final String FULL_AUTO_ANALYZE_END_TIME = "23:59:59"; + public static final int INSERT_MERGE_ITEM_COUNT = 200; + + public static final long HUGE_TABLE_DEFAULT_SAMPLE_ROWS = 4194304; + public static final long HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES = 5L * 1024 * 1024 * 1024; + + public static final long HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(12); + + public static final int TABLE_STATS_HEALTH_THRESHOLD = 60; + + public static final int ANALYZE_TIMEOUT_IN_SEC = 43200; + static { SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + FeConstants.INTERNAL_DB_NAME); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index 8f5bb605b6..7031aa1956 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -113,7 +113,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { if (!(table instanceof OlapTable || table instanceof ExternalTable)) { return true; } - if (table.getDataSize(true) < Config.huge_table_lower_bound_size_in_bytes) { + if (table.getDataSize(true) < StatisticsUtil.getHugeTableLowerBoundSizeInBytes() * 5) { return false; } TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); @@ -121,12 +121,13 @@ public class StatisticsAutoCollector extends StatisticsCollector { if (tableStats == null) { return false; } - return System.currentTimeMillis() - tableStats.updatedTime < Config.huge_table_auto_analyze_interval_in_millis; + return System.currentTimeMillis() + - tableStats.updatedTime < StatisticsUtil.getHugeTableAutoAnalyzeIntervalInMillis(); } protected 
void createAnalyzeJobForTbl(DatabaseIf db, List analysisInfos, TableIf table) { - AnalysisMethod analysisMethod = table.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes + AnalysisMethod analysisMethod = table.getDataSize(true) > StatisticsUtil.getHugeTableLowerBoundSizeInBytes() ? AnalysisMethod.SAMPLE : AnalysisMethod.FULL; AnalysisInfo jobInfo = new AnalysisInfoBuilder() .setJobId(Env.getCurrentEnv().getNextId()) @@ -141,7 +142,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { .setAnalysisType(AnalysisInfo.AnalysisType.FUNDAMENTALS) .setAnalysisMode(AnalysisInfo.AnalysisMode.INCREMENTAL) .setAnalysisMethod(analysisMethod) - .setSampleRows(Config.huge_table_default_sample_rows) + .setSampleRows(StatisticsUtil.getHugeTableSampleRows()) .setScheduleType(ScheduleType.AUTOMATIC) .setState(AnalysisState.PENDING) .setTaskIds(new ArrayList<>()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java index c2f1db6bc4..638db55398 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java @@ -73,14 +73,15 @@ public abstract class StatisticsCollector extends MasterDaemon { return; } - Map analysisTaskInfos = new HashMap<>(); + Map analysisTasks = new HashMap<>(); AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, false); + analysisManager.createTaskForEachColumns(jobInfo, analysisTasks, false); + Env.getCurrentEnv().getAnalysisManager().constructJob(jobInfo, analysisTasks.values()); if (StatisticsUtil.isExternalTable(jobInfo.catalogId, jobInfo.dbId, jobInfo.tblId)) { - analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, false); + 
analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTasks, false); } - Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTaskInfos); - analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask); + Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTasks); + analysisTasks.values().forEach(analysisTaskExecutor::submitTask); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java deleted file mode 100644 index f34ad0f122..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package org.apache.doris.statistics; - -import org.apache.doris.catalog.Env; -import org.apache.doris.common.Config; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.List; -import java.util.concurrent.TimeUnit; - -public class StatisticsPeriodCollector extends StatisticsCollector { - private static final Logger LOG = LogManager.getLogger(StatisticsPeriodCollector.class); - - public StatisticsPeriodCollector() { - super("Automatic Analyzer", - TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes) / 2, - new AnalysisTaskExecutor(Config.period_analyze_simultaneously_running_task_num)); - } - - @Override - protected void collect() { - try { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - List jobInfos = analysisManager.findPeriodicJobs(); - for (AnalysisInfo jobInfo : jobInfos) { - createSystemAnalysisJob(jobInfo); - } - } catch (Exception e) { - LOG.warn("Failed to periodically analyze the statistics." 
+ e); - } - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java index 3f9b2641b7..7cd8817a1a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java @@ -19,6 +19,8 @@ package org.apache.doris.statistics; import org.apache.doris.statistics.util.StatisticsUtil; +import com.google.common.annotations.VisibleForTesting; + import java.util.StringJoiner; public class StatsId { @@ -34,6 +36,17 @@ public class StatsId { // nullable public final String partId; + @VisibleForTesting + public StatsId() { + this.id = null; + this.catalogId = -1; + this.dbId = -1; + this.tblId = -1; + this.idxId = -1; + this.colId = null; + this.partId = null; + } + public StatsId(ResultRow row) { this.id = row.get(0); this.catalogId = Long.parseLong(row.get(1)); @@ -52,7 +65,7 @@ public class StatsId { sj.add(String.valueOf(tblId)); sj.add(String.valueOf(idxId)); sj.add(StatisticsUtil.quote(colId)); - sj.add(StatisticsUtil.quote(partId)); + sj.add(partId); return sj.toString(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 169ac3e338..d7bfd82655 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -177,12 +177,14 @@ public class StatisticsUtil { sessionVariable.enablePageCache = false; sessionVariable.parallelExecInstanceNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.parallelPipelineTaskNum = Config.statistics_sql_parallel_exec_instance_num; - sessionVariable.setEnableNereidsPlanner(false); + sessionVariable.setEnableNereidsPlanner(true); + sessionVariable.setEnablePipelineEngine(false); sessionVariable.enableProfile = 
false; sessionVariable.enableScanRunSerial = limitScan; - sessionVariable.queryTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; - sessionVariable.insertTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; + sessionVariable.queryTimeoutS = StatisticsUtil.getAnalyzeTimeout(); + sessionVariable.insertTimeoutS = StatisticsUtil.getAnalyzeTimeout(); sessionVariable.enableFileCache = false; + sessionVariable.forbidUnknownColStats = false; connectContext.setEnv(Env.getCurrentEnv()); connectContext.setDatabase(FeConstants.INTERNAL_DB_NAME); connectContext.setQualifiedUser(UserIdentity.ROOT.getQualifiedUser()); @@ -808,7 +810,7 @@ public class StatisticsUtil { public static boolean inAnalyzeTime(LocalTime now) { try { - Pair range = findRangeFromGlobalSessionVar(); + Pair range = findConfigFromGlobalSessionVar(); if (range == null) { return false; } @@ -825,16 +827,16 @@ public class StatisticsUtil { } } - private static Pair findRangeFromGlobalSessionVar() { + private static Pair findConfigFromGlobalSessionVar() { try { String startTime = - findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) + findConfigFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) .fullAutoAnalyzeStartTime; // For compatibility if (StringUtils.isEmpty(startTime)) { startTime = StatisticConstants.FULL_AUTO_ANALYZE_START_TIME; } - String endTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) + String endTime = findConfigFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) .fullAutoAnalyzeEndTime; if (StringUtils.isEmpty(startTime)) { endTime = StatisticConstants.FULL_AUTO_ANALYZE_END_TIME; @@ -846,7 +848,7 @@ public class StatisticsUtil { } } - private static SessionVariable findRangeFromGlobalSessionVar(String varName) throws Exception { + protected static SessionVariable findConfigFromGlobalSessionVar(String varName) throws Exception { SessionVariable sessionVariable = VariableMgr.newSessionVariable(); 
VariableExpr variableExpr = new VariableExpr(varName, SetType.GLOBAL); VariableMgr.getValue(sessionVariable, variableExpr); @@ -855,10 +857,71 @@ public class StatisticsUtil { public static boolean enableAutoAnalyze() { try { - return findRangeFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; + return findConfigFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; } catch (Exception e) { LOG.warn("Fail to get value of enable auto analyze, return false by default", e); } return false; } + + public static int getInsertMergeCount() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.STATS_INSERT_MERGE_ITEM_COUNT) + .statsInsertMergeItemCount; + } catch (Exception e) { + LOG.warn("Failed to get value of insert_merge_item_count, return default", e); + } + return StatisticConstants.INSERT_MERGE_ITEM_COUNT; + } + + public static long getHugeTableSampleRows() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.HUGE_TABLE_DEFAULT_SAMPLE_ROWS) + .hugeTableDefaultSampleRows; + } catch (Exception e) { + LOG.warn("Failed to get value of huge_table_default_sample_rows, return default", e); + } + return StatisticConstants.HUGE_TABLE_DEFAULT_SAMPLE_ROWS; + } + + public static long getHugeTableLowerBoundSizeInBytes() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES) + .hugeTableLowerBoundSizeInBytes; + } catch (Exception e) { + LOG.warn("Failed to get value of huge_table_lower_bound_size_in_bytes, return default", e); + } + return StatisticConstants.HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES; + } + + public static long getHugeTableAutoAnalyzeIntervalInMillis() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS) + .hugeTableAutoAnalyzeIntervalInMillis; + } catch (Exception e) { + LOG.warn("Failed to get value of huge_table_auto_analyze_interval_in_millis, return default", e); + } + 
return StatisticConstants.HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS; + } + + public static long getTableStatsHealthThreshold() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.TABLE_STATS_HEALTH_THRESHOLD) + .tableStatsHealthThreshold; + } catch (Exception e) { + LOG.warn("Failed to get value of table_stats_health_threshold, return default", e); + } + return StatisticConstants.TABLE_STATS_HEALTH_THRESHOLD; + } + + public static int getAnalyzeTimeout() { + try { + return findConfigFromGlobalSessionVar(SessionVariable.ANALYZE_TIMEOUT) + .analyzeTimeoutS; + } catch (Exception e) { + LOG.warn("Failed to get value of analyze_timeout, return default", e); + } + return StatisticConstants.ANALYZE_TIMEOUT_IN_SEC; + } + } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java index f01485f642..d4dedd1712 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java @@ -17,25 +17,10 @@ package org.apache.doris.statistics; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Database; -import org.apache.doris.catalog.InternalSchemaInitializer; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.common.FeConstants; -import org.apache.doris.datasource.InternalCatalog; -import org.apache.doris.qe.AutoCloseConnectContext; -import org.apache.doris.qe.ConnectContext; +import org.apache.doris.catalog.Env; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; -import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; -import org.apache.doris.statistics.AnalysisInfo.AnalysisType; -import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.DBObjects; import 
org.apache.doris.statistics.util.StatisticsUtil; -import org.apache.doris.utframe.TestWithFeService; -import com.google.common.collect.Maps; import mockit.Expectations; import mockit.Mock; import mockit.MockUp; @@ -44,136 +29,196 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.HashSet; +import java.util.concurrent.atomic.AtomicInteger; -public class AnalysisJobTest extends TestWithFeService { +public class AnalysisJobTest { - @Override - protected void runBeforeAll() throws Exception { - try { - InternalSchemaInitializer.createDB(); - createDatabase("analysis_job_test"); - connectContext.setDatabase("default_cluster:analysis_job_test"); - createTable("CREATE TABLE t1 (col1 int not null, col2 int not null, col3 int not null)\n" - + "DISTRIBUTED BY HASH(col3)\n" + "BUCKETS 1\n" - + "PROPERTIES(\n" + " \"replication_num\"=\"1\"\n" - + ");"); - } catch (Exception e) { - throw new RuntimeException(e); + // make user task has been set corresponding job + @Test + public void initTest(@Mocked AnalysisInfo jobInfo, @Mocked OlapAnalysisTask task) { + AnalysisJob analysisJob = new AnalysisJob(jobInfo, Arrays.asList(task)); + Assertions.assertSame(task.job, analysisJob); + } + + @Test + public void testAppendBufTest1(@Mocked AnalysisInfo analysisInfo, @Mocked OlapAnalysisTask olapAnalysisTask) { + AtomicInteger writeBufInvokeTimes = new AtomicInteger(); + new MockUp() { + @Mock + protected void writeBuf() { + writeBufInvokeTimes.incrementAndGet(); + } + + @Mock + public void updateTaskState(AnalysisState state, String msg) { + } + + @Mock + public void deregisterJob() { + } + }; + AnalysisJob job = new AnalysisJob(analysisInfo, Arrays.asList(olapAnalysisTask)); + job.queryingTask = new HashSet<>(); + job.queryingTask.add(olapAnalysisTask); + 
job.queryFinished = new HashSet<>(); + job.buf = new ArrayList<>(); + job.totalTaskCount = 20; + + // not all task finished nor cached limit exceed, shouldn't write + job.appendBuf(olapAnalysisTask, Arrays.asList(new ColStatsData())); + Assertions.assertEquals(0, writeBufInvokeTimes.get()); + } + + @Test + public void testAppendBufTest2(@Mocked AnalysisInfo analysisInfo, @Mocked OlapAnalysisTask olapAnalysisTask) { + AtomicInteger writeBufInvokeTimes = new AtomicInteger(); + AtomicInteger deregisterTimes = new AtomicInteger(); + + new MockUp() { + @Mock + protected void writeBuf() { + writeBufInvokeTimes.incrementAndGet(); + } + + @Mock + public void updateTaskState(AnalysisState state, String msg) { + } + + @Mock + public void deregisterJob() { + deregisterTimes.getAndIncrement(); + } + }; + AnalysisJob job = new AnalysisJob(analysisInfo, Arrays.asList(olapAnalysisTask)); + job.queryingTask = new HashSet<>(); + job.queryingTask.add(olapAnalysisTask); + job.queryFinished = new HashSet<>(); + job.buf = new ArrayList<>(); + job.totalTaskCount = 1; + + job.appendBuf(olapAnalysisTask, Arrays.asList(new ColStatsData())); + // all task finished, should write and deregister this job + Assertions.assertEquals(1, writeBufInvokeTimes.get()); + Assertions.assertEquals(1, deregisterTimes.get()); + } + + @Test + public void testAppendBufTest3(@Mocked AnalysisInfo analysisInfo, @Mocked OlapAnalysisTask olapAnalysisTask) { + AtomicInteger writeBufInvokeTimes = new AtomicInteger(); + + new MockUp() { + @Mock + protected void writeBuf() { + writeBufInvokeTimes.incrementAndGet(); + } + + @Mock + public void updateTaskState(AnalysisState state, String msg) { + } + + @Mock + public void deregisterJob() { + } + }; + AnalysisJob job = new AnalysisJob(analysisInfo, Arrays.asList(olapAnalysisTask)); + job.queryingTask = new HashSet<>(); + job.queryingTask.add(olapAnalysisTask); + job.queryFinished = new HashSet<>(); + job.buf = new ArrayList<>(); + ColStatsData colStatsData = new 
ColStatsData(); + for (int i = 0; i < StatisticsUtil.getInsertMergeCount(); i++) { + job.buf.add(colStatsData); } - FeConstants.runningUnitTest = true; + job.totalTaskCount = 100; + + job.appendBuf(olapAnalysisTask, Arrays.asList(new ColStatsData())); + // cache limit exceed, should write them + Assertions.assertEquals(1, writeBufInvokeTimes.get()); } @Test - public void testCreateAnalysisJob() throws Exception { - - new MockUp() { - + public void testUpdateTaskState( + @Mocked AnalysisInfo info, + @Mocked OlapAnalysisTask task1, + @Mocked OlapAnalysisTask task2) { + AtomicInteger updateTaskStatusInvokeTimes = new AtomicInteger(); + new MockUp() { @Mock - public AutoCloseConnectContext buildConnectContext() { - return new AutoCloseConnectContext(connectContext); - } - - @Mock - public void execUpdate(String sql) throws Exception { + public void updateTaskStatus(AnalysisInfo info, AnalysisState taskState, String message, long time) { + updateTaskStatusInvokeTimes.getAndIncrement(); } }; - - new MockUp() { + AnalysisManager analysisManager = new AnalysisManager(); + new MockUp() { @Mock - public List executeInternalQuery() { - return Collections.emptyList(); + public AnalysisManager getAnalysisManager() { + return analysisManager; } }; - - new MockUp() { - - @Mock - public ConnectContext get() { - return connectContext; - } - }; - String sql = "ANALYZE TABLE t1"; - Assertions.assertNotNull(getSqlStmtExecutor(sql)); + AnalysisJob job = new AnalysisJob(info, Collections.singletonList(task1)); + job.queryFinished = new HashSet<>(); + job.queryFinished.add(task2); + job.updateTaskState(AnalysisState.FAILED, ""); + Assertions.assertEquals(2, updateTaskStatusInvokeTimes.get()); } @Test - public void testJobExecution(@Mocked StmtExecutor stmtExecutor, @Mocked InternalCatalog catalog, @Mocked - Database database, - @Mocked OlapTable olapTable) - throws Exception { - new MockUp() { + public void testWriteBuf1(@Mocked AnalysisInfo info, + @Mocked OlapAnalysisTask task1, 
@Mocked OlapAnalysisTask task2) { + AnalysisJob job = new AnalysisJob(info, Collections.singletonList(task1)); + job.queryFinished = new HashSet<>(); + job.queryFinished.add(task2); + new MockUp() { + @Mock + public void updateTaskState(AnalysisState state, String msg) { + } @Mock - public Column getColumn(String name) { - return new Column("col1", PrimitiveType.INT); + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + + } + + @Mock + protected void syncLoadStats() { } }; - - new MockUp() { - - @Mock - public ConnectContext buildConnectContext() { - return connectContext; - } - - @Mock - public void execUpdate(String sql) throws Exception { - } - - @Mock - public DBObjects convertIdToObjects(long catalogId, long dbId, long tblId) { - return new DBObjects(catalog, database, olapTable); - } - }; - new MockUp() { - - @Mock - public void syncLoadColStats(long tableId, long idxId, String colName) { - } - }; - new MockUp() { - - @Mock - public void execute() throws Exception { - - } - - @Mock - public List executeInternalQuery() { - return new ArrayList<>(); - } - }; - - new MockUp() { - - @Mock - public void execSQLs(List partitionAnalysisSQLs, Map params) throws Exception {} - }; - HashMap> colToPartitions = Maps.newHashMap(); - colToPartitions.put("col1", Collections.singleton("t1")); - AnalysisInfo analysisJobInfo = new AnalysisInfoBuilder().setJobId(0).setTaskId(0) - .setCatalogId(0) - .setDBId(0) - .setTblId(0) - .setColName("col1").setJobType(JobType.MANUAL) - .setAnalysisMode(AnalysisMode.FULL) - .setAnalysisMethod(AnalysisMethod.FULL) - .setAnalysisType(AnalysisType.FUNDAMENTALS) - .setColToPartitions(colToPartitions) - .setState(AnalysisState.RUNNING) - .build(); - new OlapAnalysisTask(analysisJobInfo).doExecute(); new Expectations() { { - stmtExecutor.execute(); + job.syncLoadStats(); times = 1; } }; + job.writeBuf(); + + Assertions.assertEquals(0, job.queryFinished.size()); + } + + @Test + public void 
testWriteBuf2(@Mocked AnalysisInfo info, + @Mocked OlapAnalysisTask task1, @Mocked OlapAnalysisTask task2) { + new MockUp() { + @Mock + public void updateTaskState(AnalysisState state, String msg) { + } + + @Mock + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + throw new RuntimeException(); + } + + @Mock + protected void syncLoadStats() { + } + }; + AnalysisJob job = new AnalysisJob(info, Collections.singletonList(task1)); + job.buf.add(new ColStatsData()); + job.queryFinished = new HashSet<>(); + job.queryFinished.add(task2); + job.writeBuf(); + Assertions.assertEquals(1, job.queryFinished.size()); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java index a0d72fcd42..2e1b70fef2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java @@ -24,6 +24,7 @@ import org.apache.doris.analysis.TableName; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; @@ -340,7 +341,7 @@ public class AnalysisManagerTest { }; OlapTable olapTable = new OlapTable(); TableStatsMeta stats1 = new TableStatsMeta(0, 50, new AnalysisInfoBuilder().setColName("col1").build()); - stats1.updatedRows.addAndGet(30); + stats1.updatedRows.addAndGet(50); Assertions.assertTrue(olapTable.needReAnalyzeTable(stats1)); TableStatsMeta stats2 = new TableStatsMeta(0, 190, new AnalysisInfoBuilder().setColName("col1").build()); @@ -349,4 +350,38 @@ public class AnalysisManagerTest { } + @Test + public void testRecordLimit1() { + Config.analyze_record_limit = 
2; + AnalysisManager analysisManager = new AnalysisManager(); + analysisManager.replayCreateAnalysisJob(new AnalysisInfoBuilder().setJobId(1).build()); + analysisManager.replayCreateAnalysisJob(new AnalysisInfoBuilder().setJobId(2).build()); + analysisManager.replayCreateAnalysisJob(new AnalysisInfoBuilder().setJobId(3).build()); + Assertions.assertEquals(2, analysisManager.analysisJobInfoMap.size()); + Assertions.assertTrue(analysisManager.analysisJobInfoMap.containsKey(2L)); + Assertions.assertTrue(analysisManager.analysisJobInfoMap.containsKey(3L)); + } + + @Test + public void testRecordLimit2() { + Config.analyze_record_limit = 2; + AnalysisManager analysisManager = new AnalysisManager(); + analysisManager.replayCreateAnalysisTask(new AnalysisInfoBuilder().setTaskId(1).build()); + analysisManager.replayCreateAnalysisTask(new AnalysisInfoBuilder().setTaskId(2).build()); + analysisManager.replayCreateAnalysisTask(new AnalysisInfoBuilder().setTaskId(3).build()); + Assertions.assertEquals(2, analysisManager.analysisTaskInfoMap.size()); + Assertions.assertTrue(analysisManager.analysisTaskInfoMap.containsKey(2L)); + Assertions.assertTrue(analysisManager.analysisTaskInfoMap.containsKey(3L)); + } + + @Test + public void testRecordLimit3() { + Config.analyze_record_limit = 2; + AnalysisManager analysisManager = new AnalysisManager(); + analysisManager.autoJobs.offer(new AnalysisInfoBuilder().setJobId(1).build()); + analysisManager.autoJobs.offer(new AnalysisInfoBuilder().setJobId(2).build()); + analysisManager.autoJobs.offer(new AnalysisInfoBuilder().setJobId(3).build()); + Assertions.assertEquals(2, analysisManager.autoJobs.size()); + } + } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java index 19d7798041..8cfcfeabd2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java +++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java @@ -37,6 +37,7 @@ import com.google.common.collect.Maps; import mockit.Mock; import mockit.MockUp; import mockit.Mocked; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.Collections; @@ -45,6 +46,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.atomic.AtomicBoolean; public class AnalysisTaskExecutorTest extends TestWithFeService { @@ -82,6 +84,15 @@ public class AnalysisTaskExecutorTest extends TestWithFeService { return new Column("col1", PrimitiveType.INT); } }; + final AtomicBoolean cancelled = new AtomicBoolean(); + new MockUp() { + + @Mock + public boolean cancel(String msg) { + cancelled.set(true); + return true; + } + }; AnalysisInfo analysisJobInfo = new AnalysisInfoBuilder().setJobId(0).setTaskId(0) .setCatalogId(0) .setDBId(0) @@ -98,7 +109,10 @@ public class AnalysisTaskExecutorTest extends TestWithFeService { AnalysisTaskWrapper analysisTaskWrapper = new AnalysisTaskWrapper(analysisTaskExecutor, analysisJob); Deencapsulation.setField(analysisTaskWrapper, "startTime", 5); b.put(analysisTaskWrapper); - analysisTaskExecutor.start(); + analysisTaskExecutor.tryToCancel(); + Assertions.assertTrue(cancelled.get()); + Assertions.assertTrue(b.isEmpty()); + } @Test diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java new file mode 100644 index 0000000000..268540885d --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.InternalSchemaInitializer; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.common.FeConstants; +import org.apache.doris.datasource.InternalCatalog; +import org.apache.doris.qe.AutoCloseConnectContext; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; +import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; +import org.apache.doris.statistics.AnalysisInfo.AnalysisType; +import org.apache.doris.statistics.AnalysisInfo.JobType; +import org.apache.doris.statistics.util.DBObjects; +import org.apache.doris.statistics.util.StatisticsUtil; +import org.apache.doris.utframe.TestWithFeService; + +import com.google.common.collect.Maps; +import mockit.Expectations; +import mockit.Mock; +import mockit.MockUp; +import mockit.Mocked; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class AnalyzeTest extends TestWithFeService { + + @Override + protected void runBeforeAll() throws 
Exception { + try { + InternalSchemaInitializer.createDB(); + createDatabase("analysis_job_test"); + connectContext.setDatabase("default_cluster:analysis_job_test"); + createTable("CREATE TABLE t1 (col1 int not null, col2 int not null, col3 int not null)\n" + + "DISTRIBUTED BY HASH(col3)\n" + "BUCKETS 1\n" + + "PROPERTIES(\n" + " \"replication_num\"=\"1\"\n" + + ");"); + } catch (Exception e) { + throw new RuntimeException(e); + } + FeConstants.runningUnitTest = true; + } + + @Test + public void testCreateAnalysisJob() throws Exception { + + new MockUp() { + + @Mock + public AutoCloseConnectContext buildConnectContext() { + return new AutoCloseConnectContext(connectContext); + } + + @Mock + public void execUpdate(String sql) throws Exception { + } + }; + + new MockUp() { + @Mock + public List executeInternalQuery() { + return Collections.emptyList(); + } + }; + + new MockUp() { + + @Mock + public ConnectContext get() { + return connectContext; + } + }; + String sql = "ANALYZE TABLE t1"; + Assertions.assertNotNull(getSqlStmtExecutor(sql)); + } + + @Test + public void testJobExecution(@Mocked StmtExecutor stmtExecutor, @Mocked InternalCatalog catalog, @Mocked + Database database, + @Mocked OlapTable olapTable) + throws Exception { + new MockUp() { + + @Mock + public Column getColumn(String name) { + return new Column("col1", PrimitiveType.INT); + } + }; + + new MockUp() { + + @Mock + public ConnectContext buildConnectContext() { + return connectContext; + } + + @Mock + public void execUpdate(String sql) throws Exception { + } + + @Mock + public DBObjects convertIdToObjects(long catalogId, long dbId, long tblId) { + return new DBObjects(catalog, database, olapTable); + } + }; + new MockUp() { + + @Mock + public void syncLoadColStats(long tableId, long idxId, String colName) { + } + }; + new MockUp() { + + @Mock + public void execute() throws Exception { + + } + + @Mock + public List executeInternalQuery() { + return new ArrayList<>(); + } + }; + + new MockUp() { + + 
@Mock + public void execSQLs(List partitionAnalysisSQLs, Map params) throws Exception {} + }; + + new MockUp() { + + @Mock + protected void runQuery(String sql) {} + }; + HashMap> colToPartitions = Maps.newHashMap(); + colToPartitions.put("col1", Collections.singleton("t1")); + AnalysisInfo analysisJobInfo = new AnalysisInfoBuilder().setJobId(0).setTaskId(0) + .setCatalogId(0) + .setDBId(0) + .setTblId(0) + .setColName("col1").setJobType(JobType.MANUAL) + .setAnalysisMode(AnalysisMode.FULL) + .setAnalysisMethod(AnalysisMethod.FULL) + .setAnalysisType(AnalysisType.FUNDAMENTALS) + .setColToPartitions(colToPartitions) + .setState(AnalysisState.RUNNING) + .build(); + new OlapAnalysisTask(analysisJobInfo).doExecute(); + new Expectations() { + { + stmtExecutor.execute(); + times = 1; + } + }; + } + +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java index cb0364682c..95ed5023e3 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java @@ -23,6 +23,7 @@ import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.external.HMSExternalDatabase; import org.apache.doris.catalog.external.HMSExternalTable; +import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.datasource.CatalogMgr; import org.apache.doris.datasource.HMSExternalCatalog; import org.apache.doris.ha.FrontendNodeType; @@ -31,6 +32,9 @@ import org.apache.doris.system.Frontend; import org.apache.doris.thrift.TUpdateFollowerStatsCacheRequest; import org.apache.doris.utframe.TestWithFeService; +import com.github.benmanes.caffeine.cache.AsyncCacheLoader; +import com.github.benmanes.caffeine.cache.AsyncLoadingCache; +import com.github.benmanes.caffeine.cache.Caffeine; import com.google.common.collect.Lists; import com.google.gson.JsonArray; import 
com.google.gson.JsonElement; @@ -40,9 +44,11 @@ import mockit.Expectations; import mockit.Mock; import mockit.MockUp; import mockit.Mocked; +import org.checkerframework.checker.nullness.qual.NonNull; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -50,6 +56,7 @@ import java.util.Date; import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; +import java.util.concurrent.ThreadPoolExecutor; public class CacheTest extends TestWithFeService { @@ -199,10 +206,10 @@ public class CacheTest extends TestWithFeService { @Test public void testLoadFromMeta(@Mocked Env env, - @Mocked CatalogMgr mgr, - @Mocked HMSExternalCatalog catalog, - @Mocked HMSExternalDatabase db, - @Mocked HMSExternalTable table) throws Exception { + @Mocked CatalogMgr mgr, + @Mocked HMSExternalCatalog catalog, + @Mocked HMSExternalDatabase db, + @Mocked HMSExternalTable table) throws Exception { new MockUp() { @Mock @@ -350,4 +357,29 @@ public class CacheTest extends TestWithFeService { } }; } + + @Test + public void testEvict() { + ThreadPoolExecutor threadPool + = ThreadPoolManager.newDaemonFixedThreadPool( + 1, Integer.MAX_VALUE, "STATS_FETCH", true); + AsyncLoadingCache columnStatisticsCache = + Caffeine.newBuilder() + .maximumSize(1) + .refreshAfterWrite(Duration.ofHours(StatisticConstants.STATISTICS_CACHE_REFRESH_INTERVAL)) + .executor(threadPool) + .buildAsync(new AsyncCacheLoader() { + @Override + public @NonNull CompletableFuture asyncLoad(@NonNull Integer integer, + @NonNull Executor executor) { + return CompletableFuture.supplyAsync(() -> { + return integer; + }, threadPool); + } + }); + columnStatisticsCache.get(1); + columnStatisticsCache.get(2); + Assertions.assertTrue(columnStatisticsCache.synchronous().asMap().containsKey(2)); + Assertions.assertEquals(1, 
columnStatisticsCache.synchronous().asMap().size()); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index d618a5fa53..f2b9f84f0d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -19,47 +19,36 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.TableSample; import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.Config; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; +import org.apache.doris.statistics.AnalysisInfo.JobType; +import org.apache.doris.statistics.util.StatisticsUtil; -import mockit.Expectations; +import mockit.Mock; +import mockit.MockUp; import mockit.Mocked; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class OlapAnalysisTaskTest { + // test manual @Test - public void testAutoSample(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked TableIf tableIf) { - new Expectations() { - { - tableIf.getDataSize(true); - result = 60_0000_0000L; - } - }; + public void testSample1(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked TableIf tableIf) { AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder() .setAnalysisMethod(AnalysisMethod.FULL); + analysisInfoBuilder.setJobType(JobType.MANUAL); OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); olapAnalysisTask.info = analysisInfoBuilder.build(); olapAnalysisTask.tbl = tableIf; - Config.enable_auto_sample = true; TableSample tableSample = olapAnalysisTask.getTableSample(); - Assertions.assertEquals(4194304, tableSample.getSampleValue()); - 
Assertions.assertFalse(tableSample.isPercent()); - - new Expectations() { - { - tableIf.getDataSize(true); - result = 1_0000_0000L; - } - }; - tableSample = olapAnalysisTask.getTableSample(); Assertions.assertNull(tableSample); analysisInfoBuilder.setSampleRows(10); + analysisInfoBuilder.setJobType(JobType.MANUAL); analysisInfoBuilder.setAnalysisMethod(AnalysisMethod.SAMPLE); olapAnalysisTask.info = analysisInfoBuilder.build(); tableSample = olapAnalysisTask.getTableSample(); @@ -67,4 +56,49 @@ public class OlapAnalysisTaskTest { Assertions.assertFalse(tableSample.isPercent()); } + // test auto big table + @Test + public void testSample2(@Mocked OlapTable tbl) { + new MockUp() { + + @Mock + public long getDataSize(boolean singleReplica) { + return 1000_0000_0000L; + } + }; + + AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder() + .setAnalysisMethod(AnalysisMethod.FULL); + analysisInfoBuilder.setJobType(JobType.SYSTEM); + OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); + olapAnalysisTask.info = analysisInfoBuilder.build(); + olapAnalysisTask.tbl = tbl; + TableSample tableSample = olapAnalysisTask.getTableSample(); + Assertions.assertNotNull(tableSample); + Assertions.assertEquals(StatisticsUtil.getHugeTableSampleRows(), tableSample.getSampleValue()); + + } + + // test auto small table + @Test + public void testSample3(@Mocked OlapTable tbl) { + new MockUp() { + + @Mock + public long getDataSize(boolean singleReplica) { + return 1000; + } + }; + + AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder() + .setAnalysisMethod(AnalysisMethod.FULL); + analysisInfoBuilder.setJobType(JobType.SYSTEM); + OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); + olapAnalysisTask.info = analysisInfoBuilder.build(); + olapAnalysisTask.tbl = tbl; + TableSample tableSample = olapAnalysisTask.getTableSample(); + Assertions.assertNull(tableSample); + + } + } diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java index 5ddd207bab..d441ce5b09 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java @@ -27,6 +27,7 @@ import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.View; import org.apache.doris.cluster.ClusterNamespace; +import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; import org.apache.doris.datasource.CatalogIf; @@ -40,10 +41,12 @@ import mockit.Expectations; import mockit.Injectable; import mockit.Mock; import mockit.MockUp; +import mockit.Mocked; import org.apache.hadoop.util.Lists; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.time.LocalTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -52,6 +55,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; public class StatisticsAutoCollectorTest { @@ -213,4 +217,73 @@ public class StatisticsAutoCollectorTest { // Assertions.assertNull(statisticsAutoCollector.getReAnalyzeRequiredPart(analysisInfo2)); Assertions.assertNotNull(statisticsAutoCollector.getReAnalyzeRequiredPart(analysisInfo2)); } + + @Test + public void testLoop() { + AtomicBoolean timeChecked = new AtomicBoolean(); + AtomicBoolean switchChecked = new AtomicBoolean(); + new MockUp() { + + @Mock + public boolean inAnalyzeTime(LocalTime now) { + timeChecked.set(true); + return true; + } + + @Mock + public boolean enableAutoAnalyze() { + switchChecked.set(true); + return true; + } + }; + StatisticsAutoCollector autoCollector = new 
StatisticsAutoCollector(); + autoCollector.collect(); + Assertions.assertTrue(timeChecked.get() && switchChecked.get()); + + } + + @Test + public void checkAvailableThread() { + StatisticsAutoCollector autoCollector = new StatisticsAutoCollector(); + Assertions.assertEquals(Config.full_auto_analyze_simultaneously_running_task_num, + autoCollector.analysisTaskExecutor.executors.getMaximumPoolSize()); + } + + @Test + public void testSkip(@Mocked OlapTable olapTable, @Mocked TableStatsMeta stats, @Mocked TableIf anyOtherTable) { + new MockUp() { + + @Mock + public long getDataSize(boolean singleReplica) { + return StatisticsUtil.getHugeTableLowerBoundSizeInBytes() * 5 + 1000000000; + } + }; + + new MockUp() { + + @Mock + public TableStatsMeta findTableStatsStatus(long tblId) { + return stats; + } + }; + // A very huge table has been updated recently, so we should skip it this time + stats.updatedTime = System.currentTimeMillis() - 1000; + StatisticsAutoCollector autoCollector = new StatisticsAutoCollector(); + Assertions.assertTrue(autoCollector.skip(olapTable)); + // The update of this huge table is long time ago, so we shouldn't skip it this time + stats.updatedTime = System.currentTimeMillis() + - StatisticsUtil.getHugeTableAutoAnalyzeIntervalInMillis() - 10000; + Assertions.assertFalse(autoCollector.skip(olapTable)); + new MockUp() { + + @Mock + public TableStatsMeta findTableStatsStatus(long tblId) { + return null; + } + }; + // can't find table stats meta, which means this table never get analyzed, so we shouldn't skip it this time + Assertions.assertFalse(autoCollector.skip(olapTable)); + // this is not olap table nor external table, so we should skip it this time + Assertions.assertTrue(autoCollector.skip(anyOtherTable)); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java index c0d4a656d7..c0c790c9c2 100644 --- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java @@ -19,9 +19,15 @@ package org.apache.doris.statistics.util; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; +import org.apache.doris.qe.SessionVariable; -import org.junit.Test; +import mockit.Mock; +import mockit.MockUp; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.time.LocalTime; +import java.time.format.DateTimeFormatter; public class StatisticsUtilTest { @Test @@ -67,4 +73,42 @@ public class StatisticsUtilTest { Assertions.fail(); } } + + @Test + public void testInAnalyzeTime1() { + new MockUp() { + + @Mock + protected SessionVariable findConfigFromGlobalSessionVar(String varName) throws Exception { + SessionVariable sessionVariable = new SessionVariable(); + sessionVariable.fullAutoAnalyzeStartTime = "00:00:00"; + sessionVariable.fullAutoAnalyzeEndTime = "02:00:00"; + return sessionVariable; + } + }; + DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + String now = "01:00:00"; + Assertions.assertTrue(StatisticsUtil.inAnalyzeTime(LocalTime.parse(now, timeFormatter))); + now = "13:00:00"; + Assertions.assertFalse(StatisticsUtil.inAnalyzeTime(LocalTime.parse(now, timeFormatter))); + } + + @Test + public void testInAnalyzeTime2() { + new MockUp() { + + @Mock + protected SessionVariable findConfigFromGlobalSessionVar(String varName) throws Exception { + SessionVariable sessionVariable = new SessionVariable(); + sessionVariable.fullAutoAnalyzeStartTime = "00:00:00"; + sessionVariable.fullAutoAnalyzeEndTime = "23:00:00"; + return sessionVariable; + } + }; + DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + String now = "15:00:00"; + Assertions.assertTrue(StatisticsUtil.inAnalyzeTime(LocalTime.parse(now, timeFormatter))); + now = "23:30:00"; + 
Assertions.assertFalse(StatisticsUtil.inAnalyzeTime(LocalTime.parse(now, timeFormatter))); + } } diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 4e4c4a0842..9269255919 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -57,7 +57,11 @@ suite("test_analyze") { `analyzetestlimitedk8` double null comment "", `analyzetestlimitedk9` float null comment "", `analyzetestlimitedk12` string null comment "", - `analyzetestlimitedk13` largeint(40) null comment "" + `analyzetestlimitedk13` largeint(40) null comment "", + `analyzetestlimitedk14` ARRAY NULL COMMENT "", + `analyzetestlimitedk15` Map NULL COMMENT "", + `analyzetestlimitedk16` STRUCT NULL, + `analyzetestlimitedk17` JSON NULL ) engine=olap DUPLICATE KEY(`analyzetestlimitedk3`) DISTRIBUTED BY HASH(`analyzetestlimitedk3`) BUCKETS 5 properties("replication_num" = "1") @@ -67,26 +71,39 @@ suite("test_analyze") { INSERT INTO `${tbl}` VALUES (-2103297891,1,101,15248,4761818404925265645,939926.283, 'UTmCFKMbprf0zSVOIlBJRNOl3JcNBdOsnCDt','2022-09-28','2022-10-28 01:56:56','tVvGDSrN6kyn', -954349107.187117,-40.46286,'g1ZP9nqVgaGKya3kPERdBofTWJQ4TIJEz972Xvw4hfPpTpWwlmondiLVTCyld7rSBlSWrE7NJRB0pvPGEFQKOx1s3', - '-1559301292834325905'), + '-1559301292834325905', NULL, NULL, NULL, NULL), (-2094982029,0,-81,-14746,-2618177187906633064,121889.100,NULL,'2023-05-01','2022-11-25 00:24:12', '36jVI0phYfhFucAOEASbh4OdvUYcI7QZFgQSveNyfGcRRUtQG9HGN1UcCmUH',-82250254.174239,NULL, - 'bTUHnMC4v7dI8U3TK0z4wZHdytjfHQfF1xKdYAVwPVNMT4fT4F92hj8ENQXmCkWtfp','6971810221218612372'), + 'bTUHnMC4v7dI8U3TK0z4wZHdytjfHQfF1xKdYAVwPVNMT4fT4F92hj8ENQXmCkWtfp','6971810221218612372', NULL, NULL, NULL, NULL), (-1840301109,1,NULL,NULL,7805768460922079440,546556.220,'wC7Pif9SJrg9b0wicGfPz2ezEmEKotmN6AMI',NULL, '2023-05-20 18:13:14','NM5SLu62SGeuD',-1555800813.9748349,-11122.953, - 
'NH97wIjXk7dspvvfUUKe41ZetUnDmqLxGg8UYXwOwK3Jlu7dxO2GE9UJjyKW0NBxqUk1DY','-5004534044262380098'), + 'NH97wIjXk7dspvvfUUKe41ZetUnDmqLxGg8UYXwOwK3Jlu7dxO2GE9UJjyKW0NBxqUk1DY','-5004534044262380098', NULL, NULL, NULL, NULL), (-1819679967,0,10,NULL,-5772413527188525359,-532045.626,'kqMe4VYEZAmajLthCLRkl8StDQHKrDWz91AQ','2022-06-30', '2023-02-22 15:30:38','wAbeF3p84j5pFJTInQuKZOezFbsy8HIjmuUF',-1766437367.4377379,1791.4128, - '6OWmBD04UeKt1xI2XnR8t1kPG7qEYrf4J8RkA8UMs4HF33Yl','-8433424551792664598'), + '6OWmBD04UeKt1xI2XnR8t1kPG7qEYrf4J8RkA8UMs4HF33Yl','-8433424551792664598', NULL, NULL, NULL, NULL), (-1490846276,0,NULL,7744,6074522476276146996,594200.976,NULL,'2022-11-27','2023-03-11 21:28:44', 'yr8AuJLr2ud7DIwlt06cC7711UOsKslcDyySuqqfQE5X7Vjic6azHOrM6W',-715849856.288922,3762.217, - '4UpWZJ0Twrefw0Tm0AxFS38V5','7406302706201801560'),(-1465848366,1,72,29170,-5585523608136628843,-34210.874, + '4UpWZJ0Twrefw0Tm0AxFS38V5','7406302706201801560', NULL, NULL, NULL, NULL),(-1465848366,1,72,29170,-5585523608136628843,-34210.874, 'rMGygAWU91Wa3b5A7l1wheo6EF0o6zhw4YeE','2022-09-20','2023-06-11 18:17:16','B6m9S9O2amsa4SXrEKK0ivJ2x9m1u8av', - 862085772.298349,-22304.209,'1','-3399178642401166400'),(-394034614,1,65,5393,-200651968801088119,NULL, + 862085772.298349,-22304.209,'1','-3399178642401166400', NULL, NULL, NULL, NULL),(-394034614,1,65,5393,-200651968801088119,NULL, '9MapWX9pn8zes9Gey1lhRsH3ATyQPIysjQYi','2023-05-11','2022-07-02 02:56:53','z5VWbuKr6HiK7yC7MRIoQGrb98VUS', - 1877828963.091433,-1204.1926,'fSDQqT38rkrJEi6fwc90rivgQcRPaW5V1aEmZpdSvUm','8882970420609470903'), + 1877828963.091433,-1204.1926,'fSDQqT38rkrJEi6fwc90rivgQcRPaW5V1aEmZpdSvUm','8882970420609470903', NULL, NULL, NULL, NULL), (-287465855,0,-10,-32484,-5161845307234178602,748718.592,'n64TXbG25DQL5aw5oo9o9cowSjHCXry9HkId','2023-01-02', '2022-11-17 14:58:52','d523m4PwLdHZtPTqSoOBo5IGivCKe4A1Sc8SKCILFxgzYLe0',NULL,27979.855, - 
'ps7qwcZjBjkGfcXYMw5HQMwnElzoHqinwk8vhQCbVoGBgfotc4oSkpD3tP34h4h0tTogDMwFu60iJm1bofUzyUQofTeRwZk8','4692206687866847780') + 'ps7qwcZjBjkGfcXYMw5HQMwnElzoHqinwk8vhQCbVoGBgfotc4oSkpD3tP34h4h0tTogDMwFu60iJm1bofUzyUQofTeRwZk8','4692206687866847780', NULL, NULL, NULL, NULL) + """ + + sql """ + SET enable_nereids_planner=true; + + """ + + sql """ + SET forbid_unknown_col_stats=false; + """ + + sql """ + SELECT * FROM ${tbl} """ sql """ @@ -97,10 +114,6 @@ suite("test_analyze") { ANALYZE DATABASE ${db} WITH SYNC """ - sql """ - SET enable_nereids_planner=true; - - """ sql """ SET enable_fallback_to_original_planner=false; """ @@ -152,19 +165,19 @@ suite("test_analyze") { exception = e } - a_result_1 = sql """ + def a_result_1 = sql """ ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 10 """ - a_result_2 = sql """ + def a_result_2 = sql """ ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 5 """ - a_result_3 = sql """ + def a_result_3 = sql """ ANALYZE DATABASE ${db} WITH SAMPLE PERCENT 5 """ - show_result = sql """ + def show_result = sql """ SHOW ANALYZE """ @@ -891,8 +904,24 @@ PARTITION `p599` VALUES IN (599) } assert expected_col_stats(id_col_stats, 600, 1) - assert expected_col_stats(id_col_stats, 599, 7) + assert (int) Double.parseDouble(id_col_stats[0][2]) < 700 + && (int) Double.parseDouble(id_col_stats[0][2]) > 500 + assert expected_col_stats(id_col_stats, 0, 3) + assert expected_col_stats(id_col_stats, 2400, 4) + assert expected_col_stats(id_col_stats, 4, 5) assert expected_col_stats(id_col_stats, 0, 6) + assert expected_col_stats(id_col_stats, 599, 7) + + def update_time = id_col_stats[0][8] + + sql """ANALYZE TABLE test_600_partition_table_analyze WITH SYNC""" + + // Data has no change, update time shouldn't be update since this table don't need to analyze again + id_col_stats_2 = sql """ + SHOW COLUMN CACHED STATS test_600_partition_table_analyze(id); + """ + + assert update_time == id_col_stats_2[0][8] sql """DROP TABLE IF EXISTS 
increment_analyze_test""" sql """ @@ -1151,4 +1180,39 @@ PARTITION `p599` VALUES IN (599) return (r[0][7]).equals(expected_value) } expected_max(max, "测试") + + show_result = sql """ + SHOW ANALYZE ${tbl} + """ + + def tbl_name_as_expetected = { r,name -> + for (int i = 0; i < r.size; i++) { + if (r[i][3] != name) { + return false + } + } + return true + } + + assert show_result[0][9] == "FINISHED" + assert tbl_name_as_expetected(show_result, "${tbl}") + + show_result = sql """ + SHOW ANALYZE ${tbl} WHERE STATE = "FINISHED" + """ + + assert show_result.size() > 0 + + def all_finished = { r -> + for (int i = 0; i < r.size; i++) { + if (r[i][9] != "FINISHED") { + return false + } + } + return true + } + + assert all_finished(show_result) + + } diff --git a/regression-test/suites/statistics/test_agg_complex_type.groovy b/regression-test/suites/statistics/test_agg_complex_type.groovy new file mode 100644 index 0000000000..55af87f35b --- /dev/null +++ b/regression-test/suites/statistics/test_agg_complex_type.groovy @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_analyze_with_agg_complex_type") { + sql """drop table if exists test_agg_complex_type;""" + + sql """create table test_agg_complex_type ( + datekey int, + device_id bitmap BITMAP_UNION NULL, + hll_test hll hll_union, + qs QUANTILE_STATE QUANTILE_UNION + ) + aggregate key (datekey) + distributed by hash(datekey) buckets 1 + properties( + "replication_num" = "1" + );""" + + sql """insert into test_agg_complex_type values (1,to_bitmap(1), hll_hash("11"), TO_QUANTILE_STATE("11", 1.0));""" + + sql """insert into test_agg_complex_type values (2, to_bitmap(1), hll_hash("12"), TO_QUANTILE_STATE("11", 1.0));""" + + sql """ANALYZE TABLE test_agg_complex_type WITH SYNC""" + + def show_result = sql """SHOW COLUMN CACHED STATS test_agg_complex_type""" + + assert show_result.size() == 1 + + def expected_col_stats = { r, expected_value, idx -> + return (int) Double.parseDouble(r[0][idx]) == expected_value + } + + assert expected_col_stats(show_result, 2, 1) + assert expected_col_stats(show_result, 0, 3) + assert expected_col_stats(show_result, 8, 4) + assert expected_col_stats(show_result, 4, 5) + assert expected_col_stats(show_result, 1, 6) + assert expected_col_stats(show_result, 2, 7) +} \ No newline at end of file