[feature](statistics) support statistics for iceberg/paimon/hudi table (#29868)

Authored by wuwenchi on 2024-01-17 17:43:05 +08:00, committed by yiguolei
parent ade720470d
commit 44ba9e102c
21 changed files with 621 additions and 237 deletions

View File

@@ -23,7 +23,10 @@ import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.iceberg.IcebergExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TIcebergTable;
@@ -149,4 +152,10 @@ public class IcebergExternalTable extends ExternalTable {
() -> StatisticsUtil.getIcebergColumnStats(colName,
((IcebergExternalCatalog) catalog).getIcebergTable(dbName, name)));
}
@Override
public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
makeSureInitialized();
return new ExternalAnalysisTask(info);
}
}

View File

@@ -21,6 +21,9 @@ import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.paimon.PaimonExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TTableDescriptor;
import org.apache.doris.thrift.TTableType;
@@ -154,4 +157,10 @@ public class PaimonExternalTable extends ExternalTable {
+ getPaimonCatalogType());
}
}
@Override
public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
makeSureInitialized();
return new ExternalAnalysisTask(info);
}
}
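
Both IcebergExternalTable and PaimonExternalTable hook into the analyze framework the same way: make sure the catalog metadata is initialized, then hand back the shared ExternalAnalysisTask. A minimal sketch of that factory shape, with the Doris types reduced to hypothetical stubs (only the control flow mirrors the commit):

// Hypothetical stubs; only the createAnalysisTask shape mirrors the commit.
class AnalysisInfoStub { }

abstract class BaseAnalysisTaskStub {
    abstract void doExecute() throws Exception;
}

class ExternalAnalysisTaskStub extends BaseAnalysisTaskStub {
    ExternalAnalysisTaskStub(AnalysisInfoStub info) { }

    @Override
    void doExecute() { /* generic SQL-based stats collection */ }
}

abstract class ExternalTableStub {
    // Lazily load catalog metadata before building the task, as
    // makeSureInitialized() does in the commit.
    abstract void makeSureInitialized();

    BaseAnalysisTaskStub createAnalysisTask(AnalysisInfoStub info) {
        makeSureInitialized();
        return new ExternalAnalysisTaskStub(info);
    }
}

Because the task type is chosen by the table itself, adding another lake format only requires this one override, which is presumably how the hudi side of this commit works as well.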

View File

@@ -27,6 +27,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Collections;
@@ -37,6 +38,7 @@ import java.util.List;
* show all catalogs' info
*/
public class CatalogsProcDir implements ProcDirInterface {
private static final Logger LOG = Logger.getLogger(CatalogsProcDir.class);
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("CatalogIds").add("CatalogName").add("DatabaseNum").add("LastUpdateTime")
.build();
@@ -90,7 +92,13 @@ public class CatalogsProcDir implements ProcDirInterface {
List<Comparable> catalogInfo = Lists.newArrayList();
catalogInfo.add(catalog.getId());
catalogInfo.add(catalog.getName());
catalogInfo.add(catalog.getDbNames().size());
int size = -1;
try {
size = catalog.getDbNames().size();
} catch (Exception e) {
LOG.warn("failed to get database: ", e);
}
catalogInfo.add(size);
catalogInfo.add(TimeUtils.longToTimeString(catalog.getLastUpdateTime()));
catalogInfos.add(catalogInfo);
}
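
The proc dir previously called catalog.getDbNames() directly, so one unreachable catalog failed the whole SHOW CATALOGS listing; with this change the DatabaseNum column degrades to -1 and a warning is logged instead. A self-contained sketch of the same fail-soft pattern (the Callable stands in for the remote metadata call and is an assumption, not the Doris API):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;

public class FailSoftListing {
    // List one count per catalog; an unreachable catalog degrades to the
    // sentinel -1 instead of failing the whole result set.
    static List<Integer> databaseCounts(List<Callable<Integer>> catalogs) {
        List<Integer> counts = new ArrayList<>();
        for (Callable<Integer> catalog : catalogs) {
            int size = -1; // sentinel: could not reach the catalog
            try {
                size = catalog.call();
            } catch (Exception e) {
                System.err.println("failed to get database names: " + e);
            }
            counts.add(size);
        }
        return counts;
    }

    public static void main(String[] args) {
        List<Callable<Integer>> catalogs = List.of(
                () -> 3,                                            // healthy catalog
                () -> { throw new RuntimeException("HMS down"); }); // broken catalog
        System.out.println(databaseCounts(catalogs)); // prints [3, -1]
    }
}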

View File

@@ -0,0 +1,263 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class ExternalAnalysisTask extends BaseAnalysisTask {
private static final Logger LOG = LogManager.getLogger(ExternalAnalysisTask.class);
private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
private boolean isTableLevelTask;
private boolean isPartitionOnly;
private ExternalTable table;
// For test
public ExternalAnalysisTask() {
}
public ExternalAnalysisTask(AnalysisInfo info) {
super(info);
isTableLevelTask = info.externalTableLevelTask;
isPartitionOnly = info.partitionOnly;
table = (ExternalTable) tbl;
}
@Override
public void doExecute() throws Exception {
if (isTableLevelTask) {
getTableStats();
} else {
getOrdinaryColumnStats();
}
}
// For test
protected void setTable(ExternalTable table) {
this.table = table;
}
/**
* Get table row count
*/
private void getTableStats() {
Map<String, String> params = buildStatsParams(null);
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(ANALYZE_TABLE_COUNT_TEMPLATE));
String rowCount = columnResult.get(0).get(0);
Env.getCurrentEnv().getAnalysisManager()
.updateTableStatsStatus(
new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
job.rowCountDone(this);
}
// Get ordinary column stats
protected void getOrdinaryColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());
params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
Pair<Double, Long> sampleInfo = getSampleInfo();
params.put("scaleFactor", String.valueOf(sampleInfo.first));
StringSubstitutor stringSubstitutor;
if (tableSample == null) {
// Do full analyze
LOG.debug("Will do full collection for column {}", col.getName());
sb.append(COLLECT_COL_STATISTICS);
} else {
// Do sample analyze
LOG.debug("Will do sample collection for column {}", col.getName());
boolean limitFlag = false;
boolean bucketFlag = false;
// If sample size is too large, use limit to control the sample size.
if (needLimit(sampleInfo.second, sampleInfo.first)) {
limitFlag = true;
long columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
double targetRows = (double) sampleInfo.second / columnSize;
// Estimate the new scaleFactor based on the schema.
if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
params.put("scaleFactor",
String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
}
}
// A single distribution column is not a good fit for the DUJ1 estimator; use the linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
}
LOG.info("Sample for column [{}]. Scale factor [{}], "
+ "limited [{}], is distribute column [{}]",
col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql);
}
protected Map<String, String> buildStatsParams(String partId) {
Map<String, String> commonParams = new HashMap<>();
String id = StatisticsUtil.constructId(tbl.getId(), -1);
if (partId == null) {
commonParams.put("partId", "NULL");
} else {
id = StatisticsUtil.constructId(id, partId);
commonParams.put("partId", "\'" + partId + "\'");
}
commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
commonParams.put("id", id);
commonParams.put("catalogId", String.valueOf(catalog.getId()));
commonParams.put("dbId", String.valueOf(db.getId()));
commonParams.put("tblId", String.valueOf(tbl.getId()));
commonParams.put("indexId", "-1");
commonParams.put("idxId", "-1");
commonParams.put("colName", info.colName);
commonParams.put("colId", info.colName);
commonParams.put("catalogName", catalog.getName());
commonParams.put("dbName", db.getFullName());
commonParams.put("tblName", tbl.getName());
commonParams.put("sampleHints", getSampleHint());
commonParams.put("limit", "");
commonParams.put("scaleFactor", "1");
if (col != null) {
commonParams.put("type", col.getType().toString());
}
commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
return commonParams;
}
protected String getSampleHint() {
if (tableSample == null) {
return "";
}
if (tableSample.isPercent()) {
return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
} else {
return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
}
}
/**
* Get the pair of sample scale factor and the file size going to be sampled.
* While analyzing, the count, null count, and data size results need to be
* multiplied by this scale factor to get a more accurate result.
* @return Pair of sample scale factor and the file size going to be sampled.
*/
protected Pair<Double, Long> getSampleInfo() {
if (tableSample == null) {
return Pair.of(1.0, 0L);
}
long target;
// Get the sizes of all files in this external table.
List<Long> chunkSizes = table.getChunkSizes();
Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
long total = 0;
// Calculate the total file size of this external table.
for (long size : chunkSizes) {
total += size;
}
if (total == 0) {
return Pair.of(1.0, 0L);
}
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
// Calculate the actual sample size (cumulate).
long cumulate = 0;
for (long size : chunkSizes) {
cumulate += size;
if (cumulate >= target) {
break;
}
}
return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
}
@Override
protected void afterExecution() {
// A table-level task doesn't need to sync any value to the stats cache; the row count is stored in table metadata.
// A partition-only task doesn't need to refresh the cache.
if (isTableLevelTask || isPartitionOnly) {
return;
}
Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
}
/**
* If the size to sample is larger than LIMIT_SIZE (1GB)
* and is much larger (1.2*) than the size the user wants to sample,
* use limit to control the total sample size.
* @param sizeToRead The file size to sample.
* @param factor sizeToRead * factor = Table total size.
* @return True if need to limit.
*/
protected boolean needLimit(long sizeToRead, double factor) {
long total = (long) (sizeToRead * factor);
long target;
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
return true;
}
return false;
}
}
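
The new class is mostly template plumbing plus sampling arithmetic, so both halves can be exercised outside Doris. Below is a runnable sketch, not the commit's code: it assumes only Apache Commons Text, the catalog/db/table names are made up, and LIMIT_SIZE (1 GiB) and LIMIT_FACTOR (1.2x) are taken from the needLimit javadoc above; sampleInfo and needLimit mirror getSampleInfo and needLimit, and the template string is copied from ANALYZE_TABLE_COUNT_TEMPLATE.

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.text.StringSubstitutor;

public class ExternalStatsSketch {
    static final long LIMIT_SIZE = 1024L * 1024L * 1024L; // 1 GiB, per the needLimit javadoc
    static final double LIMIT_FACTOR = 1.2;               // per the needLimit javadoc

    // Copied from ANALYZE_TABLE_COUNT_TEMPLATE above.
    static final String ROW_COUNT_TEMPLATE =
            "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
                    + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";

    // Mirror of getSampleInfo: shuffle the file sizes with a fixed seed,
    // accumulate files until the target byte count is covered, and return
    // {scaleFactor, bytesToRead}.
    static double[] sampleInfo(List<Long> chunkSizes, long targetBytes, long seed) {
        List<Long> chunks = new ArrayList<>(chunkSizes);
        Collections.shuffle(chunks, new Random(seed));
        long total = chunks.stream().mapToLong(Long::longValue).sum();
        if (total == 0) {
            return new double[] {1.0, 0};
        }
        long cumulate = 0;
        for (long size : chunks) {
            cumulate += size;
            if (cumulate >= targetBytes) {
                break;
            }
        }
        return new double[] {Math.max((double) total / cumulate, 1.0), cumulate};
    }

    // Mirror of needLimit: add a LIMIT only when the bytes to read are both
    // over 1 GiB and well past (1.2x) what the user asked to sample.
    static boolean needLimit(long sizeToRead, long targetBytes) {
        return sizeToRead > LIMIT_SIZE && sizeToRead > targetBytes * LIMIT_FACTOR;
    }

    public static void main(String[] args) {
        List<Long> files = List.of(600L << 20, 300L << 20, 500L << 20); // file sizes in bytes
        long target = 700L << 20;                                       // sample ~700 MiB
        double[] info = sampleInfo(files, target, 42);
        System.out.printf("scaleFactor=%.3f bytesToRead=%.0f needLimit=%b%n",
                info[0], info[1], needLimit((long) info[1], target));

        Map<String, String> params = new HashMap<>();
        params.put("scaleFactor", String.valueOf(info[0]));
        params.put("catalogName", "iceberg_catalog"); // made-up identifiers
        params.put("dbName", "db1");
        params.put("tblName", "t1");
        params.put("sampleHints", "TABLESAMPLE(50 PERCENT)"); // shape of getSampleHint()
        System.out.println(new StringSubstitutor(params).replace(ROW_COUNT_TEMPLATE));
    }
}

The scale factor is simply total bytes over sampled bytes, clamped at 1, which is why counts and data sizes measured on the sample are multiplied by it before being written to the statistics table.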

View File

@@ -17,12 +17,10 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.hive.HiveMetaStoreCache;
import org.apache.doris.external.hive.util.HiveUtil;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -32,64 +30,36 @@ import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class HMSAnalysisTask extends BaseAnalysisTask {
public class HMSAnalysisTask extends ExternalAnalysisTask {
private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class);
private HMSExternalTable hmsExternalTable;
private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
private boolean isTableLevelTask;
private boolean isPartitionOnly;
private HMSExternalTable table;
// for test
public HMSAnalysisTask() {
}
public HMSAnalysisTask(AnalysisInfo info) {
super(info);
isTableLevelTask = info.externalTableLevelTask;
isPartitionOnly = info.partitionOnly;
table = (HMSExternalTable) tbl;
hmsExternalTable = (HMSExternalTable) tbl;
}
public void doExecute() throws Exception {
if (isTableLevelTask) {
getTableStats();
} else {
getTableColumnStats();
}
private boolean isPartitionColumn() {
return hmsExternalTable.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
}
// For test
protected void setTable(HMSExternalTable table) {
this.table = table;
setTable((ExternalTable) table);
this.hmsExternalTable = table;
}
/**
* Get table row count
*/
private void getTableStats() throws Exception {
Map<String, String> params = buildStatsParams(null);
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(ANALYZE_TABLE_COUNT_TEMPLATE));
String rowCount = columnResult.get(0).get(0);
Env.getCurrentEnv().getAnalysisManager()
.updateTableStatsStatus(
new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
job.rowCountDone(this);
}
/**
* Get column statistics and insert the result to __internal_schema.column_statistics
*/
protected void getTableColumnStats() throws Exception {
@Override
protected void getOrdinaryColumnStats() throws Exception {
if (!info.usingSqlForPartitionColumn) {
try {
if (isPartitionColumn()) {
@@ -102,77 +72,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "fallback to normal collection",
isPartitionColumn() ? "partition " : "", col.getName(), e);
/* retry using sql way! */
getOrdinaryColumnStats();
super.getOrdinaryColumnStats();
}
} else {
getOrdinaryColumnStats();
super.getOrdinaryColumnStats();
}
}
private boolean isPartitionColumn() {
return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
}
// Get ordinary column stats. Ordinary column means not partition column.
private void getOrdinaryColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());
params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
Pair<Double, Long> sampleInfo = getSampleInfo();
params.put("scaleFactor", String.valueOf(sampleInfo.first));
StringSubstitutor stringSubstitutor;
if (tableSample == null) {
// Do full analyze
LOG.debug("Will do full collection for column {}", col.getName());
sb.append(COLLECT_COL_STATISTICS);
} else {
// Do sample analyze
LOG.debug("Will do sample collection for column {}", col.getName());
boolean limitFlag = false;
boolean bucketFlag = false;
// If sample size is too large, use limit to control the sample size.
if (needLimit(sampleInfo.second, sampleInfo.first)) {
limitFlag = true;
long columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
double targetRows = (double) sampleInfo.second / columnSize;
// Estimate the new scaleFactor based on the schema.
if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
params.put("scaleFactor",
String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
}
}
// Single distribution column is not fit for DUJ1 estimator, use linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
}
LOG.info("Sample for column [{}]. Scale factor [{}], "
+ "limited [{}], is distribute column [{}]",
col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql);
}
// Collect the partition column stats through HMS metadata.
// Get all the partition values and calculate the stats based on the values.
private void getPartitionColumnStats() throws Exception {
Set<String> partitionNames = table.getPartitionNames();
Set<String> partitionNames = hmsExternalTable.getPartitionNames();
Set<String> ndvPartValues = Sets.newHashSet();
long numNulls = 0;
long dataSize = 0;
@@ -198,8 +108,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
}
// Estimate the row count. This value is inaccurate if the table stats is empty.
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
dataSize = dataSize * count / partitionNames.size();
numNulls = numNulls * count / partitionNames.size();
int ndv = ndvPartValues.size();
@@ -218,8 +129,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
// Collect the spark analyzed column stats through HMS metadata.
private void getHmsColumnStats() throws Exception {
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
Map<String, String> params = buildStatsParams("NULL");
Map<StatsType, String> statsParams = new HashMap<>();
@@ -229,7 +141,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
statsParams.put(StatsType.MAX_VALUE, "max");
statsParams.put(StatsType.AVG_SIZE, "avg_len");
if (table.fillColumnStatistics(info.colName, statsParams, params)) {
if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
throw new AnalysisException("some column stats not available");
}
@@ -283,126 +195,4 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
return value.compareTo(currentMax) > 0 ? value : currentMax;
}
private Map<String, String> buildStatsParams(String partId) {
Map<String, String> commonParams = new HashMap<>();
String id = StatisticsUtil.constructId(tbl.getId(), -1);
if (partId == null) {
commonParams.put("partId", "NULL");
} else {
id = StatisticsUtil.constructId(id, partId);
commonParams.put("partId", "\'" + partId + "\'");
}
commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
commonParams.put("id", id);
commonParams.put("catalogId", String.valueOf(catalog.getId()));
commonParams.put("dbId", String.valueOf(db.getId()));
commonParams.put("tblId", String.valueOf(tbl.getId()));
commonParams.put("indexId", "-1");
commonParams.put("idxId", "-1");
commonParams.put("colName", info.colName);
commonParams.put("colId", info.colName);
commonParams.put("catalogName", catalog.getName());
commonParams.put("dbName", db.getFullName());
commonParams.put("tblName", tbl.getName());
commonParams.put("sampleHints", getSampleHint());
commonParams.put("limit", "");
commonParams.put("scaleFactor", "1");
if (col != null) {
commonParams.put("type", col.getType().toString());
}
commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
return commonParams;
}
protected String getSampleHint() {
if (tableSample == null) {
return "";
}
if (tableSample.isPercent()) {
return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
} else {
return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
}
}
/**
* Get the pair of sample scale factor and the file size going to sample.
* While analyzing, the result of count, null count and data size need to
* multiply this scale factor to get more accurate result.
* @return Pair of sample scale factor and the file size going to sample.
*/
protected Pair<Double, Long> getSampleInfo() {
if (tableSample == null) {
return Pair.of(1.0, 0L);
}
long target;
// Get list of all files' size in this HMS table.
List<Long> chunkSizes = table.getChunkSizes();
Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
long total = 0;
// Calculate the total size of this HMS table.
for (long size : chunkSizes) {
total += size;
}
if (total == 0) {
return Pair.of(1.0, 0L);
}
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
// Calculate the actual sample size (cumulate).
long cumulate = 0;
for (long size : chunkSizes) {
cumulate += size;
if (cumulate >= target) {
break;
}
}
return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
}
@Override
protected void afterExecution() {
// Table level task doesn't need to sync any value to sync stats, it stores the value in metadata.
// Partition only task doesn't need to refresh cached.
if (isTableLevelTask || isPartitionOnly) {
return;
}
Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
}
/**
* If the size to sample is larger than LIMIT_SIZE (1GB)
* and is much larger (1.2*) than the size user want to sample,
* use limit to control the total sample size.
* @param sizeToRead The file size to sample.
* @param factor sizeToRead * factor = Table total size.
* @return True if need to limit.
*/
protected boolean needLimit(long sizeToRead, double factor) {
long total = (long) (sizeToRead * factor);
long target;
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
return true;
}
return false;
}
}
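
After the refactor, HMSAnalysisTask keeps only the HMS-specific paths (partition-value stats and spark-written HMS column stats) and falls back to the inherited SQL collection when those fail or when usingSqlForPartitionColumn is set. A compilable sketch of that override-with-fallback shape, with the Doris task hierarchy reduced to hypothetical stubs:

// Stubs standing in for the Doris task hierarchy; only the control flow
// mirrors the commit.
class ExternalAnalysisTaskSketch {
    protected void getOrdinaryColumnStats() throws Exception {
        // generic SQL-based collection (full, linear, or DUJ1 sampling)
    }
}

class HmsAnalysisTaskSketch extends ExternalAnalysisTaskSketch {
    boolean usingSqlForPartitionColumn; // from AnalysisInfo in the real code
    boolean partitionColumn;            // isPartitionColumn() in the real code

    @Override
    protected void getOrdinaryColumnStats() throws Exception {
        if (!usingSqlForPartitionColumn) {
            try {
                if (partitionColumn) {
                    getPartitionColumnStats(); // from HMS partition values
                } else {
                    getHmsColumnStats();       // from spark-written HMS stats
                }
                return;
            } catch (Exception e) {
                // metadata path failed: fall through to the SQL collection
            }
        }
        super.getOrdinaryColumnStats();
    }

    private void getPartitionColumnStats() throws Exception { /* HMS metadata */ }

    private void getHmsColumnStats() throws Exception { /* HMS metadata */ }
}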

View File

@@ -84,10 +84,12 @@ import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
@@ -735,8 +737,12 @@ public class StatisticsUtil {
columnStatisticBuilder.setDataSize(0);
columnStatisticBuilder.setAvgSizeByte(0);
columnStatisticBuilder.setNumNulls(0);
for (FileScanTask task : tableScan.planFiles()) {
processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask task : fileScanTasks) {
processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
}
} catch (IOException e) {
LOG.warn("Error to close FileScanTask.", e);
}
if (columnStatisticBuilder.getCount() > 0) {
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize()
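
planFiles() returns a CloseableIterable backed by open manifest readers, so the commit switches the loop to try-with-resources rather than leaking them. A minimal sketch of the same pattern against the public Iceberg API, assuming the Table comes from the catalog as in getIcebergTable above:

import java.io.IOException;

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class ScanFiles {
    // Iterate the scan tasks and make sure the backing readers are closed.
    static long totalRecords(Table table) {
        long records = 0;
        try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
            for (FileScanTask task : tasks) {
                records += task.file().recordCount(); // per-data-file record count
            }
        } catch (IOException e) {
            // Closing failed; the totals gathered so far are still usable.
        }
        return records;
    }
}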

View File

@@ -252,7 +252,7 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(true);
task.info = analysisInfoBuilder.build();
task.getTableColumnStats();
task.getOrdinaryColumnStats();
}
@@ -309,6 +309,6 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(false);
task.info = analysisInfoBuilder.build();
task.getTableColumnStats();
task.getOrdinaryColumnStats();
}
}