diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index f22e15b8f7..127240626e 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -237,16 +237,16 @@ Automatic collection tasks do not support viewing the completion status and fail |full_auto_analyze_end_time|End time for automatic statistics collection|02:00:00| |enable_auto_sample|Enable automatic sampling for large tables. When enabled, statistics will be automatically collected through sampling for tables larger than the `huge_table_lower_bound_size_in_bytes` threshold.| false| |auto_analyze_job_record_count|Controls the persistence of records for automatically triggered statistics collection jobs.|20000| -|huge_table_default_sample_rows|Defines the number of sample rows for large tables when automatic sampling is enabled.|200000| +|huge_table_default_sample_rows|Defines the number of sample rows for large tables when automatic sampling is enabled.|4194304| |huge_table_lower_bound_size_in_bytes|Defines the lower size threshold for large tables. When `enable_auto_sample` is enabled, statistics will be automatically collected through sampling for tables larger than this value.|5368 709120| |huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Within this interval, tables larger than `huge_table_lower_bound_size_in_bytes` will only be analyzed once.|43200000| |table_stats_health_threshold|Takes a value between 0-100. When the data update volume reaches (100 - table_stats_health_threshold)% since the last statistics collection operation, the statistics for the table are considered outdated.|80| -|enable_full_auto_analyze|Enable automatic collection functionality|true| |Session Variable|Description|Default Value| |---|---|---| |full_auto_analyze_start_time|Start time for automatic statistics collection|00:00:00| |full_auto_analyze_end_time|End time for automatic statistics collection|02:00:00| +|enable_full_auto_analyze|Enable automatic collection functionality|true| Please note that when both FE configuration and global session variables are configured for the same parameter, the value of the global session variable takes precedence. diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index 0fbbf4c979..9bd69f9448 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -249,16 +249,16 @@ SHOW AUTO ANALYZE [ptable_name] |full_auto_analyze_end_time|自动统计信息收集结束时间|02:00:00| |enable_auto_sample|是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集| false| |auto_analyze_job_record_count|控制统计信息的自动触发作业执行记录的持久化行数|20000| -|huge_table_default_sample_rows|定义开启开启大表自动sample后,对大表的采样行数|200000| +|huge_table_default_sample_rows|定义开启开启大表自动sample后,对大表的采样行数|4194304| |huge_table_lower_bound_size_in_bytes|定义大表的大小下界,在开启enable_auto_sample的情况下,大小超过该值的表将会自动通过采样收集统计信息|5368 709120| |huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次|43200000| |table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|80| -|enable_full_auto_analyze|开启自动收集功能|true| |会话变量|说明|默认值| |---|---|---| |full_auto_analyze_start_time|自动统计信息收集开始时间|00:00:00| |full_auto_analyze_end_time|自动统计信息收集结束时间|02:00:00| +|enable_full_auto_analyze|开启自动收集功能|true| 注意,对于fe配置和全局会话变量中均可配置的参数都设置的情况下,优先使用全局会话变量参数值。 diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 92fd50ede1..09b9e4abef 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2078,15 +2078,6 @@ public class Config extends ConfigBase { "Sample size for hive row count estimation."}) public static int hive_stats_partition_sample_size = 3000; - @ConfField - public static boolean enable_full_auto_analyze = true; - - @ConfField - public static String full_auto_analyze_start_time = "00:00:00"; - - @ConfField - public static String full_auto_analyze_end_time = "02:00:00"; - @ConfField public static int statistics_sql_parallel_exec_instance_num = 1; @@ -2183,10 +2174,10 @@ public class Config extends ConfigBase { + "statistics through sampling"}) public static long huge_table_lower_bound_size_in_bytes = 5L * 1024 * 1024 * 1024; - @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样行数", - "This defines the number of sample rows for large tables when automatic sampling for" + @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样比例", + "This defines the number of sample percent for large tables when automatic sampling for" + "large tables is enabled"}) - public static int huge_table_default_sample_rows = 20_0000; + public static int huge_table_default_sample_rows = 4194304; @ConfField(description = {"是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集" + "统计信息", "Whether to enable automatic sampling for large tables, which, when enabled, automatically" diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index d3055f2cbe..a7f7f7cd13 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1566,14 +1566,18 @@ public class OlapTable extends Table { return oldPartition; } - public long getDataSize() { + public long getDataSize(boolean singleReplica) { long dataSize = 0; for (Partition partition : getAllPartitions()) { - dataSize += partition.getDataSize(false); + dataSize += partition.getDataSize(singleReplica); } return dataSize; } + public long getDataSize() { + return getDataSize(false); + } + public long getRemoteDataSize() { long remoteDataSize = 0; for (Partition partition : getAllPartitions()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index d3e5beabf2..5c9ea40d3f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -246,7 +246,7 @@ public interface TableIf { return -1L; } - default long getDataSize() { + default long getDataSize(boolean singleReplica) { // TODO: Each tableIf should impl it by itself. return 0; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 2df47f1e9a..9d52d12efc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -414,6 +414,8 @@ public class SessionVariable implements Serializable, Writable { public static final String TEST_QUERY_CACHE_HIT = "test_query_cache_hit"; + public static final String ENABLE_FULL_AUTO_ANALYZE = "enable_full_auto_analyze"; + public static final List DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -1199,13 +1201,13 @@ public class SessionVariable implements Serializable, Writable { description = {"该参数定义自动ANALYZE例程的开始时间", "This parameter defines the start time for the automatic ANALYZE routine."}, flag = VariableMgr.GLOBAL) - public String fullAutoAnalyzeStartTime = ""; + public String fullAutoAnalyzeStartTime = "00:00:00"; @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_END_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", description = {"该参数定义自动ANALYZE例程的结束时间", "This parameter defines the end time for the automatic ANALYZE routine."}, flag = VariableMgr.GLOBAL) - public String fullAutoAnalyzeEndTime = ""; + public String fullAutoAnalyzeEndTime = "02:00:00"; @VariableMgr.VarAttr(name = ENABLE_UNIQUE_KEY_PARTIAL_UPDATE, needForward = true) public boolean enableUniqueKeyPartialUpdate = false; @@ -1217,6 +1219,11 @@ public class SessionVariable implements Serializable, Writable { options = {"none", "sql_cache", "partition_cache"}) public String testQueryCacheHit = "none"; + @VariableMgr.VarAttr(name = ENABLE_FULL_AUTO_ANALYZE, + description = {"该参数控制是否开启自动收集", "Set false to disable auto analyze"}, + flag = VariableMgr.GLOBAL) + public boolean enableFullAutoAnalyze = true; + // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java index 800c446576..9aa3d85992 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java @@ -56,7 +56,7 @@ public class AnalysisTaskWrapper extends FutureTask { if (task.killed) { return; } - if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.checkAnalyzeTime( + if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.inAnalyzeTime( LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { // TODO: Do we need a separate AnalysisState here? Env.getCurrentEnv().getAnalysisManager() diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index a563e6a6fe..fd99c97e83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -228,7 +228,8 @@ public abstract class BaseAnalysisTask { } int sampleRows = info.sampleRows; if (info.analysisMethod == AnalysisMethod.FULL) { - if (Config.enable_auto_sample && tbl.getDataSize() > Config.huge_table_lower_bound_size_in_bytes) { + if (Config.enable_auto_sample + && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { sampleRows = Config.huge_table_default_sample_rows; } else { return ""; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index 4bb7030753..e6b8297d0c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -81,6 +81,9 @@ public class StatisticConstants { // union more relation than 512 may cause StackOverFlowException in the future. public static final int UNION_ALL_LIMIT = 512; + public static final String FULL_AUTO_ANALYZE_START_TIME = "00:00:00"; + public static final String FULL_AUTO_ANALYZE_END_TIME = "23:59:59"; + static { SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + FeConstants.INTERNAL_DB_NAME); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index 6bfbf61dd4..399861a8b2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -58,11 +58,11 @@ public class StatisticsAutoCollector extends StatisticsCollector { @Override protected void collect() { - if (!StatisticsUtil.checkAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + if (!StatisticsUtil.inAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { analysisTaskExecutor.clear(); return; } - if (Config.enable_full_auto_analyze) { + if (StatisticsUtil.enableAutoAnalyze()) { analyzeAll(); } } @@ -115,7 +115,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { if (!(table instanceof OlapTable || table instanceof ExternalTable)) { return true; } - if (table.getDataSize() < Config.huge_table_lower_bound_size_in_bytes) { + if (table.getDataSize(true) < Config.huge_table_lower_bound_size_in_bytes) { return false; } TableStats tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); @@ -124,7 +124,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { protected void createAnalyzeJobForTbl(DatabaseIf db, List analysisInfos, TableIf table) { - AnalysisMethod analysisMethod = table.getDataSize() > Config.huge_table_lower_bound_size_in_bytes + AnalysisMethod analysisMethod = table.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes ? AnalysisMethod.SAMPLE : AnalysisMethod.FULL; TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), table.getName()); @@ -141,7 +141,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { .setAnalysisType(AnalysisInfo.AnalysisType.FUNDAMENTALS) .setAnalysisMode(AnalysisInfo.AnalysisMode.INCREMENTAL) .setAnalysisMethod(analysisMethod) - .setSamplePercent(Config.huge_table_default_sample_rows) + .setSampleRows(Config.huge_table_default_sample_rows) .setScheduleType(ScheduleType.AUTOMATIC) .setState(AnalysisState.PENDING) .setTaskIds(new ArrayList<>()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java index f65829e748..2d5c481683 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java @@ -23,12 +23,15 @@ import org.apache.doris.common.util.MasterDaemon; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.HashMap; import java.util.Map; public abstract class StatisticsCollector extends MasterDaemon { + private static final Logger LOG = LogManager.getLogger(StatisticsCollector.class); protected final AnalysisTaskExecutor analysisTaskExecutor; @@ -45,6 +48,7 @@ public abstract class StatisticsCollector extends MasterDaemon { return; } if (!StatisticsUtil.statsTblAvailable()) { + LOG.info("Stats table not available, skip"); return; } if (Env.isCheckpointThread()) { @@ -52,6 +56,7 @@ public abstract class StatisticsCollector extends MasterDaemon { } if (!analysisTaskExecutor.idle()) { + LOG.info("Analyze tasks those submitted in last time is not finished, skip"); return; } collect(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 1cb131099c..dd4e4f8515 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -174,6 +174,7 @@ public class StatisticsUtil { sessionVariable.setMaxExecMemByte(Config.statistics_sql_mem_limit_in_bytes); sessionVariable.cpuResourceLimit = Config.cpu_resource_limit_per_analyze_task; sessionVariable.setEnableInsertStrict(true); + sessionVariable.enablePageCache = false; sessionVariable.parallelExecInstanceNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.parallelPipelineTaskNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.setEnableNereidsPlanner(false); @@ -729,15 +730,14 @@ public class StatisticsUtil { return table instanceof ExternalTable; } - public static boolean checkAnalyzeTime(LocalTime now) { + public static boolean inAnalyzeTime(LocalTime now) { try { Pair range = findRangeFromGlobalSessionVar(); - DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); - LocalTime start = range == null - ? LocalTime.parse(Config.full_auto_analyze_start_time, timeFormatter) : range.first; - LocalTime end = range == null - ? LocalTime.parse(Config.full_auto_analyze_end_time, timeFormatter) : range.second; - + if (range == null) { + return false; + } + LocalTime start = range.first; + LocalTime end = range.second; if (start.isAfter(end) && (now.isAfter(start) || now.isBefore(end))) { return true; } else { @@ -754,13 +754,14 @@ public class StatisticsUtil { String startTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) .fullAutoAnalyzeStartTime; + // For compatibility if (StringUtils.isEmpty(startTime)) { - return null; + startTime = StatisticConstants.FULL_AUTO_ANALYZE_START_TIME; } String endTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) .fullAutoAnalyzeEndTime; if (StringUtils.isEmpty(startTime)) { - return null; + endTime = StatisticConstants.FULL_AUTO_ANALYZE_END_TIME; } DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); return Pair.of(LocalTime.parse(startTime, timeFormatter), LocalTime.parse(endTime, timeFormatter)); @@ -775,4 +776,13 @@ public class StatisticsUtil { VariableMgr.getValue(sessionVariable, variableExpr); return sessionVariable; } + + public static boolean enableAutoAnalyze() { + try { + return findRangeFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; + } catch (Exception e) { + LOG.warn("Fail to get value of enable auto analyze, return false by default", e); + } + return false; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 185ae1f6f8..37fef3ebc8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -34,7 +34,7 @@ public class OlapAnalysisTaskTest { public void testAutoSample(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked TableIf tableIf) { new Expectations() { { - tableIf.getDataSize(); + tableIf.getDataSize(true); result = 60_0000_0000L; } }; @@ -46,11 +46,11 @@ public class OlapAnalysisTaskTest { olapAnalysisTask.tbl = tableIf; Config.enable_auto_sample = true; String sampleExpr = olapAnalysisTask.getSampleExpression(); - Assertions.assertEquals("TABLESAMPLE(200000 ROWS)", sampleExpr); + Assertions.assertEquals("TABLESAMPLE(4194304 ROWS)", sampleExpr); new Expectations() { { - tableIf.getDataSize(); + tableIf.getDataSize(true); result = 1_0000_0000L; } };