From 1130317b91d5a83282a1cb0b144449cee95cf5e4 Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Tue, 17 Oct 2023 10:31:57 +0800 Subject: [PATCH] [Improvement](statistics)Collect stats for hive partition column using metadata (#24853) Hive partition columns' stats could be calculated from hive metastore data. Doesn't need to execute sql to get the stats. This PR is using hive partition metadata to collect partition column stats. --- docs/en/docs/query-acceleration/statistics.md | 5 +- .../docs/query-acceleration/statistics.md | 5 +- fe/fe-core/src/main/cup/sql_parser.cup | 7 + .../doris/analysis/AnalyzeProperties.java | 7 +- .../apache/doris/analysis/AnalyzeStmt.java | 4 + .../datasource/hive/HiveMetaStoreCache.java | 14 +- .../doris/external/hive/util/HiveUtil.java | 16 ++ .../apache/doris/statistics/AnalysisInfo.java | 8 +- .../doris/statistics/AnalysisInfoBuilder.java | 20 +- .../doris/statistics/AnalysisManager.java | 1 + .../doris/statistics/BaseAnalysisTask.java | 18 ++ .../doris/statistics/HMSAnalysisTask.java | 154 ++++++++++-- .../doris/statistics/util/StatisticsUtil.java | 5 + fe/fe-core/src/main/jflex/sql_scanner.flex | 1 + .../test_hive_partition_column_analyze.groovy | 230 ++++++++++++++++++ 15 files changed, 447 insertions(+), 48 deletions(-) create mode 100644 regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index 127240626e..4bb3d941b0 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -52,9 +52,9 @@ Syntax: ```SQL ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ] + [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ] [ (column_name [, ...]) ] - [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ]] + [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ] [ PROPERTIES ("key" = "value", ...) ]; ``` @@ -65,6 +65,7 @@ Where: - `column_name`: Specifies the target column. It must be an existing column in `table_name`, and multiple column names are separated by commas. - `sync`: Collect statistics synchronously. Returns upon completion. If not specified, it executes asynchronously and returns a task ID. - `sample percent | rows`: Collect statistics using sampling. You can specify either the sampling percentage or the number of sampled rows. +- `sql`: Collect statistics for external partition column with sql. By default, it uses meta data for partition columns, which is faster but may inaccurate for row count and size. Using sql could collect the accurate stats. ### Automatic Statistics Collection diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index 9bd69f9448..fd3066995e 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -51,9 +51,9 @@ Doris支持用户通过提交ANALYZE语句来触发统计信息的收集和更 ```SQL ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ] + [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ] [ (column_name [, ...]) ] - [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ]] + [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ] [ PROPERTIES ("key" = "value", ...) ]; ``` @@ -64,6 +64,7 @@ ANALYZE < TABLE | DATABASE table_name | db_name > - column_name: 指定的目标列。必须是  `table_name`  中存在的列,多个列名称用逗号分隔。 - sync:同步收集统计信息。收集完后返回。若不指定则异步执行并返回任务 ID。 - sample percent | rows:抽样收集统计信息。可以指定抽样比例或者抽样行数。 +- sql:执行sql来收集外表分区列统计信息。默认从元数据收集分区列信息,这样效率比较高但是行数和数据量大小可能不准。用户可以指定使用sql来收集,这样可以收集到准确的分区列信息。 ### 自动收集 diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 989a28d548..4b6c74fbe9 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -577,6 +577,7 @@ terminal String KW_SNAPSHOT, KW_SONAME, KW_SPLIT, + KW_SQL, KW_SQL_BLOCK_RULE, KW_START, KW_STARTS, @@ -6142,6 +6143,12 @@ with_analysis_properties ::= put(AnalyzeProperties.PROPERTY_FORCE_FULL, "true"); }}; :} + | KW_SQL + {: + RESULT = new HashMap() {{ + put(AnalyzeProperties.PROPERTY_PARTITION_COLUMN_FROM_SQL, "true"); + }}; + :} ; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java index d7e639da3a..4b5f161d2b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java @@ -42,8 +42,8 @@ public class AnalyzeProperties { public static final String PROPERTY_NUM_BUCKETS = "num.buckets"; public static final String PROPERTY_ANALYSIS_TYPE = "analysis.type"; public static final String PROPERTY_PERIOD_SECONDS = "period.seconds"; - public static final String PROPERTY_FORCE_FULL = "force.full"; + public static final String PROPERTY_PARTITION_COLUMN_FROM_SQL = "partition.column.from.sql"; public static final AnalyzeProperties DEFAULT_PROP = new AnalyzeProperties(new HashMap() { { @@ -71,6 +71,7 @@ public class AnalyzeProperties { .add(PROPERTY_PERIOD_SECONDS) .add(PROPERTY_PERIOD_CRON) .add(PROPERTY_FORCE_FULL) + .add(PROPERTY_PARTITION_COLUMN_FROM_SQL) .build(); public AnalyzeProperties(Map properties) { @@ -276,6 +277,10 @@ public class AnalyzeProperties { return properties.containsKey(PROPERTY_SAMPLE_ROWS); } + public boolean usingSqlForPartitionColumn() { + return properties.containsKey(PROPERTY_PARTITION_COLUMN_FROM_SQL); + } + public String toSQL() { StringBuilder sb = new StringBuilder(); sb.append("PROPERTIES("); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java index ae2c6a7ff4..2c4a51757c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java @@ -97,4 +97,8 @@ public class AnalyzeStmt extends StatementBase { public boolean forceFull() { return analyzeProperties.forceFull(); } + + public boolean usingSqlForPartitionColumn() { + return analyzeProperties.usingSqlForPartitionColumn(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index e5a43f3bb0..8c9b5552a7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -87,10 +87,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.FileNotFoundException; -import java.io.UnsupportedEncodingException; import java.net.URI; -import java.net.URLDecoder; -import java.nio.charset.StandardCharsets; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.HashMap; @@ -298,16 +295,7 @@ public class HiveMetaStoreCache { Preconditions.checkState(parts.length == types.size(), partitionName + " vs. " + types); List values = Lists.newArrayListWithExpectedSize(types.size()); for (String part : parts) { - String[] kv = part.split("="); - Preconditions.checkState(kv.length == 2, partitionName); - String partitionValue; - try { - // hive partition value maybe contains special characters like '=' and '/' - partitionValue = URLDecoder.decode(kv[1], StandardCharsets.UTF_8.name()); - } catch (UnsupportedEncodingException e) { - // It should not be here - throw new RuntimeException(e); - } + String partitionValue = HiveUtil.getHivePartitionValue(part); values.add(new PartitionValue(partitionValue, HIVE_DEFAULT_PARTITION.equals(partitionValue))); } try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java index 85b8034fe7..deb048b594 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java @@ -27,6 +27,7 @@ import org.apache.doris.fs.FileSystemFactory; import org.apache.doris.fs.remote.BrokerFileSystem; import org.apache.doris.fs.remote.RemoteFileSystem; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -46,8 +47,11 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import java.util.List; /** @@ -221,4 +225,16 @@ public final class HiveUtil { } } + public static String getHivePartitionValue(String part) { + String[] kv = part.split("="); + Preconditions.checkState(kv.length == 2, String.format("Malformed partition name %s", part)); + try { + // hive partition value maybe contains special characters like '=' and '/' + return URLDecoder.decode(kv[1], StandardCharsets.UTF_8.name()); + } catch (UnsupportedEncodingException e) { + // It should not be here + throw new RuntimeException(e); + } + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index 2bf06d0c2c..9c298a6ea8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -179,13 +179,17 @@ public class AnalysisInfo implements Writable { @SerializedName("forceFull") public final boolean forceFull; + @SerializedName("usingSqlForPartitionColumn") + public final boolean usingSqlForPartitionColumn; + public AnalysisInfo(long jobId, long taskId, List taskIds, long catalogId, long dbId, long tblId, Map> colToPartitions, Set partitionNames, String colName, Long indexId, JobType jobType, AnalysisMode analysisMode, AnalysisMethod analysisMethod, AnalysisType analysisType, int samplePercent, long sampleRows, int maxBucketNum, long periodTimeInMs, String message, long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, - boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull) { + boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull, + boolean usingSqlForPartitionColumn) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -219,6 +223,7 @@ public class AnalysisInfo implements Writable { this.cronExprStr = cronExpression.getCronExpression(); } this.forceFull = forceFull; + this.usingSqlForPartitionColumn = usingSqlForPartitionColumn; } @Override @@ -259,6 +264,7 @@ public class AnalysisInfo implements Writable { sj.add("cronExpr: " + cronExprStr); } sj.add("forceFull: " + forceFull); + sj.add("usingSqlForPartitionColumn: " + usingSqlForPartitionColumn); return sj.toString(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index b6dc7b5dd7..18cf3e3079 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -60,6 +60,7 @@ public class AnalysisInfoBuilder { private long partitionCount; private CronExpression cronExpression; private boolean forceFull; + private boolean usingSqlForPartitionColumn; public AnalysisInfoBuilder() { } @@ -95,6 +96,7 @@ public class AnalysisInfoBuilder { partitionCount = info.partitionCount; cronExpression = info.cronExpression; forceFull = info.forceFull; + usingSqlForPartitionColumn = info.usingSqlForPartitionColumn; } public AnalysisInfoBuilder setJobId(long jobId) { @@ -237,12 +239,19 @@ public class AnalysisInfoBuilder { return this; } - public void setCronExpression(CronExpression cronExpression) { + public AnalysisInfoBuilder setCronExpression(CronExpression cronExpression) { this.cronExpression = cronExpression; + return this; } - public void setForceFull(boolean forceFull) { + public AnalysisInfoBuilder setForceFull(boolean forceFull) { this.forceFull = forceFull; + return this; + } + + public AnalysisInfoBuilder setUsingSqlForPartitionColumn(boolean usingSqlForPartitionColumn) { + this.usingSqlForPartitionColumn = usingSqlForPartitionColumn; + return this; } public AnalysisInfo build() { @@ -250,7 +259,7 @@ public class AnalysisInfoBuilder { colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, - cronExpression, forceFull); + cronExpression, forceFull, usingSqlForPartitionColumn); } public AnalysisInfoBuilder copy() { @@ -281,6 +290,9 @@ public class AnalysisInfoBuilder { .setSamplingPartition(samplingPartition) .setPartitionOnly(partitionOnly) .setAllPartition(isAllPartition) - .setPartitionCount(partitionCount); + .setPartitionCount(partitionCount) + .setCronExpression(cronExpression) + .setForceFull(forceFull) + .setUsingSqlForPartitionColumn(usingSqlForPartitionColumn); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 1099fd3e1f..01c4e3c25b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -537,6 +537,7 @@ public class AnalysisManager extends Daemon implements Writable { infoBuilder.setLastExecTimeInMs(0); infoBuilder.setCronExpression(cronExpression); infoBuilder.setForceFull(stmt.forceFull()); + infoBuilder.setUsingSqlForPartitionColumn(stmt.usingSqlForPartitionColumn()); if (analysisMethod == AnalysisMethod.SAMPLE) { infoBuilder.setSamplePercent(samplePercent); infoBuilder.setSampleRows(sampleRows); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 04b775bcd6..1377fe05ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -93,6 +93,24 @@ public abstract class BaseAnalysisTask { + " ${internalDB}.${columnStatTbl}.part_id IS NOT NULL" + " ) t1, \n"; + protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = "INSERT INTO " + + "${internalDB}.${columnStatTbl}" + + " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "NULL AS part_id, " + + "${row_count} AS row_count, " + + "${ndv} AS ndv, " + + "${null_count} AS null_count, " + + "'${min}' AS min, " + + "'${max}' AS max, " + + "${data_size} AS data_size, " + + "NOW() "; + protected AnalysisInfo info; protected CatalogIf> catalog; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 7ef87ac940..df2396de03 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -21,12 +21,15 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.FeConstants; +import org.apache.doris.datasource.hive.HiveMetaStoreCache; +import org.apache.doris.external.hive.util.HiveUtil; import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -118,7 +121,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { * Get table row count */ private void getTableStats() throws Exception { - Map params = buildTableStatsParams(null); + Map params = buildStatsParams(null); List columnResult = StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); @@ -132,6 +135,33 @@ public class HMSAnalysisTask extends BaseAnalysisTask { * Get column statistics and insert the result to __internal_schema.column_statistics */ private void getTableColumnStats() throws Exception { + if (isPartitionOnly) { + getPartitionNames(); + List partitionAnalysisSQLs = new ArrayList<>(); + for (String partId : this.partitionNames) { + partitionAnalysisSQLs.add(generateSqlForPartition(partId)); + } + execSQLs(partitionAnalysisSQLs); + } else { + if (!info.usingSqlForPartitionColumn && isPartitionColumn()) { + try { + getPartitionColumnStats(); + } catch (Exception e) { + LOG.warn("Failed to collect stats for partition col {} using metadata, " + + "fallback to normal collection", col.getName(), e); + getOrdinaryColumnStats(); + } + } else { + getOrdinaryColumnStats(); + } + } + } + + private boolean isPartitionColumn() { + return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); + } + + private void getOrdinaryColumnStats() throws Exception { // An example sql for a column stats: // INSERT INTO __internal_schema.column_statistics // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, @@ -148,26 +178,100 @@ public class HMSAnalysisTask extends BaseAnalysisTask { // MAX(`r_regionkey`) AS max, // 0 AS data_size, // NOW() FROM `hive`.`tpch100`.`region` - if (isPartitionOnly) { - getPartitionNames(); - List partitionAnalysisSQLs = new ArrayList<>(); - for (String partId : this.partitionNames) { - partitionAnalysisSQLs.add(generateSqlForPartition(partId)); + StringBuilder sb = new StringBuilder(); + sb.append(ANALYZE_TABLE_TEMPLATE); + Map params = buildStatsParams("NULL"); + params.put("dataSizeFunction", getDataSizeFunction(col)); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(sb.toString()); + executeInsertSql(sql); + } + + private void getPartitionColumnStats() throws Exception { + Set partitionNames = table.getPartitionNames(); + Set ndvPartValues = Sets.newHashSet(); + long numNulls = 0; + long dataSize = 0; + String min = null; + String max = null; + for (String names : partitionNames) { + // names is like "date=20230101" for one level partition + // and like "date=20230101/hour=12" for two level partition + String[] parts = names.split("/"); + for (String part : parts) { + if (part.startsWith(col.getName())) { + String value = HiveUtil.getHivePartitionValue(part); + // HIVE_DEFAULT_PARTITION hive partition value when the partition name is not specified. + if (value == null || value.isEmpty() || value.equals(HiveMetaStoreCache.HIVE_DEFAULT_PARTITION)) { + numNulls += 1; + continue; + } + ndvPartValues.add(value); + dataSize += col.getType().isStringType() ? value.length() : col.getType().getSlotSize(); + min = updateMinValue(min, value); + max = updateMaxValue(max, value); + } } - execSQLs(partitionAnalysisSQLs); - } else { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); - Map params = buildTableStatsParams("NULL"); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("colName", col.getName()); - params.put("colId", info.colName); - params.put("dataSizeFunction", getDataSizeFunction(col)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); } + // Estimate the row count. This value is inaccurate if the table stats is empty. + TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); + long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount; + dataSize = dataSize * count / partitionNames.size(); + numNulls = numNulls * count / partitionNames.size(); + int ndv = ndvPartValues.size(); + + Map params = buildStatsParams("NULL"); + params.put("row_count", String.valueOf(count)); + params.put("ndv", String.valueOf(ndv)); + params.put("null_count", String.valueOf(numNulls)); + params.put("min", min); + params.put("max", max); + params.put("data_size", String.valueOf(dataSize)); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE); + executeInsertSql(sql); + } + + private String updateMinValue(String currentMin, String value) { + if (currentMin == null) { + return value; + } + if (col.getType().isFixedPointType()) { + if (Long.parseLong(value) < Long.parseLong(currentMin)) { + return value; + } else { + return currentMin; + } + } + if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() || col.getType().isDecimalV3()) { + if (Double.parseDouble(value) < Double.parseDouble(currentMin)) { + return value; + } else { + return currentMin; + } + } + return value.compareTo(currentMin) < 0 ? value : currentMin; + } + + private String updateMaxValue(String currentMax, String value) { + if (currentMax == null) { + return value; + } + if (col.getType().isFixedPointType()) { + if (Long.parseLong(value) > Long.parseLong(currentMax)) { + return value; + } else { + return currentMax; + } + } + if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() || col.getType().isDecimalV3()) { + if (Double.parseDouble(value) > Double.parseDouble(currentMax)) { + return value; + } else { + return currentMax; + } + } + return value.compareTo(currentMax) > 0 ? value : currentMax; } private void getPartitionNames() { @@ -198,11 +302,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { sb.append(" and "); } } - Map params = buildTableStatsParams(partId); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("colName", col.getName()); - params.put("colId", info.colName); + Map params = buildStatsParams(partId); params.put("dataSizeFunction", getDataSizeFunction(col)); return new StringSubstitutor(params).replace(sb.toString()); } @@ -262,7 +362,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { } } - private Map buildTableStatsParams(String partId) { + private Map buildStatsParams(String partId) { Map commonParams = new HashMap<>(); String id = StatisticsUtil.constructId(tbl.getId(), -1); if (partId == null) { @@ -271,12 +371,16 @@ public class HMSAnalysisTask extends BaseAnalysisTask { id = StatisticsUtil.constructId(id, partId); commonParams.put("partId", "\'" + partId + "\'"); } + commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME); + commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); commonParams.put("id", id); commonParams.put("catalogId", String.valueOf(catalog.getId())); commonParams.put("dbId", String.valueOf(db.getId())); commonParams.put("tblId", String.valueOf(tbl.getId())); commonParams.put("indexId", "-1"); commonParams.put("idxId", "-1"); + commonParams.put("colName", info.colName); + commonParams.put("colId", info.colName); commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index dba5075aae..169ac3e338 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -629,7 +629,12 @@ public class StatisticsUtil { } // Estimate row count: totalSize/estimatedRowSize long estimatedRowSize = 0; + List partitionColumns = table.getPartitionColumns(); for (Column column : table.getFullSchema()) { + // Partition column shouldn't count to the row size, because it is not in the data file. + if (partitionColumns != null && partitionColumns.contains(column)) { + continue; + } estimatedRowSize += column.getDataType().getSlotSize(); } if (estimatedRowSize == 0) { diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 10fa8600ac..d4423f7d4c 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -429,6 +429,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("snapshot", new Integer(SqlParserSymbols.KW_SNAPSHOT)); keywordMap.put("soname", new Integer(SqlParserSymbols.KW_SONAME)); keywordMap.put("split", new Integer(SqlParserSymbols.KW_SPLIT)); + keywordMap.put("sql", new Integer(SqlParserSymbols.KW_SQL)); keywordMap.put("sql_block_rule", new Integer(SqlParserSymbols.KW_SQL_BLOCK_RULE)); keywordMap.put("sample", new Integer(SqlParserSymbols.KW_SAMPLE)); keywordMap.put("start", new Integer(SqlParserSymbols.KW_START)); diff --git a/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy b/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy new file mode 100644 index 0000000000..42aeea1524 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_partition_column_analyze", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_partition_column_analyze" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + // Test analyze table without init. + sql """analyze table ${catalog_name}.multi_partition.multi_partition_parquet (event_day) with sync""" + sql """analyze table ${catalog_name}.multi_partition.multi_partition_orc (event_day) with sync""" + + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """use multi_partition;""" + def result = sql """show column stats multi_partition_parquet (event_day)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "event_day") + assertTrue(result[0][1] == "3.83714205E8") + assertTrue(result[0][2] == "99949.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "3.83714205E9") + assertTrue(result[0][5] == "10.0") + assertTrue(result[0][6] == "\'1749-09-24\'") + assertTrue(result[0][7] == "\'2023-05-26\'") + + result = sql """show column stats multi_partition_orc (event_day)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "event_day") + assertTrue(result[0][1] == "1.9007155E8") + assertTrue(result[0][2] == "99949.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1.9007155E9") + assertTrue(result[0][5] == "10.0") + assertTrue(result[0][6] == "\'1749-09-24\'") + assertTrue(result[0][7] == "\'2023-05-26\'") + + sql """analyze table ${catalog_name}.partition_type.tinyint_partition (tinyint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.smallint_partition (smallint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.int_partition (int_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.bigint_partition (bigint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.char_partition (char_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.varchar_partition (varchar_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.string_partition (string_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.date_partition (date_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.float_partition (float_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.double_partition (double_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.two_partition (part1, part2) with sync""" + + sql """use partition_type;""" + + result = sql """show column stats tinyint_partition (tinyint_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "tinyint_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "141474.0") + assertTrue(result[0][5] == "1.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "100") + + result = sql """show column stats smallint_partition (smallint_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "smallint_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "282948.0") + assertTrue(result[0][5] == "2.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "100") + + result = sql """show column stats int_partition (int_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "int_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "565896.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "100") + + result = sql """show column stats bigint_partition (bigint_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "bigint_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1131792.0") + assertTrue(result[0][5] == "8.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "100") + + result = sql """show column stats char_partition (char_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "char_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "101.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "2829480.0") + assertTrue(result[0][5] == "20.0") + assertTrue(result[0][6] == "\'1 \'") + assertTrue(result[0][7] == "\'a \'") + + result = sql """show column stats varchar_partition (varchar_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "varchar_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "271630.0") + assertTrue(result[0][5] == "1.9199994345250717") + assertTrue(result[0][6] == "\'1\'") + assertTrue(result[0][7] == "\'99\'") + + result = sql """show column stats string_partition (string_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "string_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "271630.0") + assertTrue(result[0][5] == "1.9199994345250717") + assertTrue(result[0][6] == "\'1\'") + assertTrue(result[0][7] == "\'99\'") + + result = sql """show column stats date_partition (date_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "date_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "565896.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "\'2001-10-12\'") + assertTrue(result[0][7] == "\'2100-10-12\'") + + result = sql """show column stats float_partition (float_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "float_part") + assertTrue(result[0][1] == "117416.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "469664.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "296.31") + assertTrue(result[0][7] == "32585.627") + + result = sql """show column stats double_partition (double_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "double_part") + assertTrue(result[0][1] == "16987.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "135896.0") + assertTrue(result[0][5] == "8.0") + assertTrue(result[0][6] == "115.145") + assertTrue(result[0][7] == "32761.145") + + result = sql """show column stats decimal_partition (decimal_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "decimal_part") + assertTrue(result[0][1] == "141474.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1131792.0") + assertTrue(result[0][5] == "8.0") + assertTrue(result[0][6] == "243.2868") + assertTrue(result[0][7] == "32527.1543") + + sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync with sql""" + result = sql """show column stats decimal_partition (decimal_part)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "decimal_part") + assertTrue(result[0][1] == "100000.0") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "800000.0") + assertTrue(result[0][5] == "8.0") + assertTrue(result[0][6] == "243.2868") + assertTrue(result[0][7] == "32527.1543") + + result = sql """show column stats two_partition (part1)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "part1") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "100") + + result = sql """show column stats two_partition (part2)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "part2") + assertTrue(result[0][2] == "100.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][6] == "\'1\'") + assertTrue(result[0][7] == "\'99\'") + + sql """drop catalog ${catalog_name}""" + } +} +