[feature](statistics) support statistics for iceberg/paimon/hudi table (#29868)

Authored by wuwenchi on 2024-01-17 17:43:05 +08:00, committed by yiguolei
parent ade720470d
commit 44ba9e102c
21 changed files with 621 additions and 237 deletions

View File

@@ -23,7 +23,10 @@ import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.iceberg.IcebergExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TIcebergTable;
@@ -149,4 +152,10 @@ public class IcebergExternalTable extends ExternalTable {
() -> StatisticsUtil.getIcebergColumnStats(colName,
((IcebergExternalCatalog) catalog).getIcebergTable(dbName, name)));
}
@Override
public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
makeSureInitialized();
return new ExternalAnalysisTask(info);
}
}

View File

@@ -21,6 +21,9 @@ import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.paimon.PaimonExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TTableDescriptor;
import org.apache.doris.thrift.TTableType;
@@ -154,4 +157,10 @@ public class PaimonExternalTable extends ExternalTable {
+ getPaimonCatalogType());
}
}
@Override
public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
makeSureInitialized();
return new ExternalAnalysisTask(info);
}
}
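
Both IcebergExternalTable and PaimonExternalTable hook into the analyze framework the same way: make sure the catalog metadata is initialized, then hand back the shared ExternalAnalysisTask. A minimal sketch of that factory shape, with the Doris types reduced to hypothetical stubs (only the control flow mirrors the commit):

// Hypothetical stubs; only the createAnalysisTask shape mirrors the commit.
class AnalysisInfoStub { }

abstract class BaseAnalysisTaskStub {
    abstract void doExecute() throws Exception;
}

class ExternalAnalysisTaskStub extends BaseAnalysisTaskStub {
    ExternalAnalysisTaskStub(AnalysisInfoStub info) { }

    @Override
    void doExecute() { /* generic SQL-based stats collection */ }
}

abstract class ExternalTableStub {
    // Lazily load catalog metadata before building the task, as
    // makeSureInitialized() does in the commit.
    abstract void makeSureInitialized();

    BaseAnalysisTaskStub createAnalysisTask(AnalysisInfoStub info) {
        makeSureInitialized();
        return new ExternalAnalysisTaskStub(info);
    }
}

Because the task type is chosen by the table itself, adding another lake format only requires this one override, which is presumably how the hudi side of this commit works as well.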

View File

@@ -27,6 +27,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Collections;
@@ -37,6 +38,7 @@ import java.util.List;
* show all catalogs' info
*/
public class CatalogsProcDir implements ProcDirInterface {
private static final Logger LOG = Logger.getLogger(CatalogsProcDir.class);
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("CatalogIds").add("CatalogName").add("DatabaseNum").add("LastUpdateTime")
.build();
@@ -90,7 +92,13 @@ public class CatalogsProcDir implements ProcDirInterface {
List<Comparable> catalogInfo = Lists.newArrayList();
catalogInfo.add(catalog.getId());
catalogInfo.add(catalog.getName());
catalogInfo.add(catalog.getDbNames().size());
int size = -1;
try {
size = catalog.getDbNames().size();
} catch (Exception e) {
LOG.warn("failed to get database: ", e);
}
catalogInfo.add(size);
catalogInfo.add(TimeUtils.longToTimeString(catalog.getLastUpdateTime()));
catalogInfos.add(catalogInfo);
}
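
The proc dir previously called catalog.getDbNames() directly, so one unreachable catalog failed the whole SHOW CATALOGS listing; with this change the DatabaseNum column degrades to -1 and a warning is logged instead. A self-contained sketch of the same fail-soft pattern (the Callable stands in for the remote metadata call and is an assumption, not the Doris API):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;

public class FailSoftListing {
    // List one count per catalog; an unreachable catalog degrades to the
    // sentinel -1 instead of failing the whole result set.
    static List<Integer> databaseCounts(List<Callable<Integer>> catalogs) {
        List<Integer> counts = new ArrayList<>();
        for (Callable<Integer> catalog : catalogs) {
            int size = -1; // sentinel: could not reach the catalog
            try {
                size = catalog.call();
            } catch (Exception e) {
                System.err.println("failed to get database names: " + e);
            }
            counts.add(size);
        }
        return counts;
    }

    public static void main(String[] args) {
        List<Callable<Integer>> catalogs = List.of(
                () -> 3,                                            // healthy catalog
                () -> { throw new RuntimeException("HMS down"); }); // broken catalog
        System.out.println(databaseCounts(catalogs)); // prints [3, -1]
    }
}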

View File

@@ -0,0 +1,263 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class ExternalAnalysisTask extends BaseAnalysisTask {
private static final Logger LOG = LogManager.getLogger(ExternalAnalysisTask.class);
private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
private boolean isTableLevelTask;
private boolean isPartitionOnly;
private ExternalTable table;
// For test
public ExternalAnalysisTask() {
}
public ExternalAnalysisTask(AnalysisInfo info) {
super(info);
isTableLevelTask = info.externalTableLevelTask;
isPartitionOnly = info.partitionOnly;
table = (ExternalTable) tbl;
}
@Override
public void doExecute() throws Exception {
if (isTableLevelTask) {
getTableStats();
} else {
getOrdinaryColumnStats();
}
}
// For test
protected void setTable(ExternalTable table) {
this.table = table;
}
/**
* Get table row count
*/
private void getTableStats() {
Map<String, String> params = buildStatsParams(null);
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(ANALYZE_TABLE_COUNT_TEMPLATE));
String rowCount = columnResult.get(0).get(0);
Env.getCurrentEnv().getAnalysisManager()
.updateTableStatsStatus(
new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
job.rowCountDone(this);
}
// Get ordinary column stats
protected void getOrdinaryColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());
params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
Pair<Double, Long> sampleInfo = getSampleInfo();
params.put("scaleFactor", String.valueOf(sampleInfo.first));
StringSubstitutor stringSubstitutor;
if (tableSample == null) {
// Do full analyze
LOG.debug("Will do full collection for column {}", col.getName());
sb.append(COLLECT_COL_STATISTICS);
} else {
// Do sample analyze
LOG.debug("Will do sample collection for column {}", col.getName());
boolean limitFlag = false;
boolean bucketFlag = false;
// If sample size is too large, use limit to control the sample size.
if (needLimit(sampleInfo.second, sampleInfo.first)) {
limitFlag = true;
long columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
double targetRows = (double) sampleInfo.second / columnSize;
// Estimate the new scaleFactor based on the schema.
if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
params.put("scaleFactor",
String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
}
}
// A single distribution column is not a good fit for the DUJ1 estimator; use the linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
}
LOG.info("Sample for column [{}]. Scale factor [{}], "
+ "limited [{}], is distribute column [{}]",
col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql);
}
protected Map<String, String> buildStatsParams(String partId) {
Map<String, String> commonParams = new HashMap<>();
String id = StatisticsUtil.constructId(tbl.getId(), -1);
if (partId == null) {
commonParams.put("partId", "NULL");
} else {
id = StatisticsUtil.constructId(id, partId);
commonParams.put("partId", "\'" + partId + "\'");
}
commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
commonParams.put("id", id);
commonParams.put("catalogId", String.valueOf(catalog.getId()));
commonParams.put("dbId", String.valueOf(db.getId()));
commonParams.put("tblId", String.valueOf(tbl.getId()));
commonParams.put("indexId", "-1");
commonParams.put("idxId", "-1");
commonParams.put("colName", info.colName);
commonParams.put("colId", info.colName);
commonParams.put("catalogName", catalog.getName());
commonParams.put("dbName", db.getFullName());
commonParams.put("tblName", tbl.getName());
commonParams.put("sampleHints", getSampleHint());
commonParams.put("limit", "");
commonParams.put("scaleFactor", "1");
if (col != null) {
commonParams.put("type", col.getType().toString());
}
commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
return commonParams;
}
protected String getSampleHint() {
if (tableSample == null) {
return "";
}
if (tableSample.isPercent()) {
return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
} else {
return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
}
}
/**
* Get the pair of sample scale factor and the file size going to be sampled.
* While analyzing, the count, null count, and data size results need to be
* multiplied by this scale factor to get a more accurate result.
* @return Pair of sample scale factor and the file size going to be sampled.
*/
protected Pair<Double, Long> getSampleInfo() {
if (tableSample == null) {
return Pair.of(1.0, 0L);
}
long target;
// Get the sizes of all files in this external table.
List<Long> chunkSizes = table.getChunkSizes();
Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
long total = 0;
// Calculate the total file size of this external table.
for (long size : chunkSizes) {
total += size;
}
if (total == 0) {
return Pair.of(1.0, 0L);
}
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
// Calculate the actual sample size (cumulate).
long cumulate = 0;
for (long size : chunkSizes) {
cumulate += size;
if (cumulate >= target) {
break;
}
}
return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
}
@Override
protected void afterExecution() {
// A table-level task doesn't need to sync any value to the stats cache; the row count is stored in table metadata.
// A partition-only task doesn't need to refresh the cache.
if (isTableLevelTask || isPartitionOnly) {
return;
}
Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
}
/**
* If the size to sample is larger than LIMIT_SIZE (1GB)
* and is much larger (1.2*) than the size the user wants to sample,
* use limit to control the total sample size.
* @param sizeToRead The file size to sample.
* @param factor sizeToRead * factor = Table total size.
* @return True if need to limit.
*/
protected boolean needLimit(long sizeToRead, double factor) {
long total = (long) (sizeToRead * factor);
long target;
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
return true;
}
return false;
}
}
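
The new class is mostly template plumbing plus sampling arithmetic, so both halves can be exercised outside Doris. Below is a runnable sketch, not the commit's code: it assumes only Apache Commons Text, the catalog/db/table names are made up, and LIMIT_SIZE (1 GiB) and LIMIT_FACTOR (1.2x) are taken from the needLimit javadoc above; sampleInfo and needLimit mirror getSampleInfo and needLimit, and the template string is copied from ANALYZE_TABLE_COUNT_TEMPLATE.

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.text.StringSubstitutor;

public class ExternalStatsSketch {
    static final long LIMIT_SIZE = 1024L * 1024L * 1024L; // 1 GiB, per the needLimit javadoc
    static final double LIMIT_FACTOR = 1.2;               // per the needLimit javadoc

    // Copied from ANALYZE_TABLE_COUNT_TEMPLATE above.
    static final String ROW_COUNT_TEMPLATE =
            "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
                    + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";

    // Mirror of getSampleInfo: shuffle the file sizes with a fixed seed,
    // accumulate files until the target byte count is covered, and return
    // {scaleFactor, bytesToRead}.
    static double[] sampleInfo(List<Long> chunkSizes, long targetBytes, long seed) {
        List<Long> chunks = new ArrayList<>(chunkSizes);
        Collections.shuffle(chunks, new Random(seed));
        long total = chunks.stream().mapToLong(Long::longValue).sum();
        if (total == 0) {
            return new double[] {1.0, 0};
        }
        long cumulate = 0;
        for (long size : chunks) {
            cumulate += size;
            if (cumulate >= targetBytes) {
                break;
            }
        }
        return new double[] {Math.max((double) total / cumulate, 1.0), cumulate};
    }

    // Mirror of needLimit: add a LIMIT only when the bytes to read are both
    // over 1 GiB and well past (1.2x) what the user asked to sample.
    static boolean needLimit(long sizeToRead, long targetBytes) {
        return sizeToRead > LIMIT_SIZE && sizeToRead > targetBytes * LIMIT_FACTOR;
    }

    public static void main(String[] args) {
        List<Long> files = List.of(600L << 20, 300L << 20, 500L << 20); // file sizes in bytes
        long target = 700L << 20;                                       // sample ~700 MiB
        double[] info = sampleInfo(files, target, 42);
        System.out.printf("scaleFactor=%.3f bytesToRead=%.0f needLimit=%b%n",
                info[0], info[1], needLimit((long) info[1], target));

        Map<String, String> params = new HashMap<>();
        params.put("scaleFactor", String.valueOf(info[0]));
        params.put("catalogName", "iceberg_catalog"); // made-up identifiers
        params.put("dbName", "db1");
        params.put("tblName", "t1");
        params.put("sampleHints", "TABLESAMPLE(50 PERCENT)"); // shape of getSampleHint()
        System.out.println(new StringSubstitutor(params).replace(ROW_COUNT_TEMPLATE));
    }
}

The scale factor is simply total bytes over sampled bytes, clamped at 1, which is why counts and data sizes measured on the sample are multiplied by it before being written to the statistics table.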

View File

@@ -17,12 +17,10 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.hive.HiveMetaStoreCache;
import org.apache.doris.external.hive.util.HiveUtil;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -32,64 +30,36 @@ import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class HMSAnalysisTask extends BaseAnalysisTask {
public class HMSAnalysisTask extends ExternalAnalysisTask {
private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class);
private HMSExternalTable hmsExternalTable;
private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
private boolean isTableLevelTask;
private boolean isPartitionOnly;
private HMSExternalTable table;
// for test
public HMSAnalysisTask() {
}
public HMSAnalysisTask(AnalysisInfo info) {
super(info);
isTableLevelTask = info.externalTableLevelTask;
isPartitionOnly = info.partitionOnly;
table = (HMSExternalTable) tbl;
hmsExternalTable = (HMSExternalTable) tbl;
}
public void doExecute() throws Exception {
if (isTableLevelTask) {
getTableStats();
} else {
getTableColumnStats();
}
private boolean isPartitionColumn() {
return hmsExternalTable.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
}
// For test
protected void setTable(HMSExternalTable table) {
this.table = table;
setTable((ExternalTable) table);
this.hmsExternalTable = table;
}
/**
* Get table row count
*/
private void getTableStats() throws Exception {
Map<String, String> params = buildStatsParams(null);
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
.replace(ANALYZE_TABLE_COUNT_TEMPLATE));
String rowCount = columnResult.get(0).get(0);
Env.getCurrentEnv().getAnalysisManager()
.updateTableStatsStatus(
new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
job.rowCountDone(this);
}
/**
* Get column statistics and insert the result to __internal_schema.column_statistics
*/
protected void getTableColumnStats() throws Exception {
@Override
protected void getOrdinaryColumnStats() throws Exception {
if (!info.usingSqlForPartitionColumn) {
try {
if (isPartitionColumn()) {
@@ -102,77 +72,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "fallback to normal collection",
isPartitionColumn() ? "partition " : "", col.getName(), e);
/* retry using sql way! */
getOrdinaryColumnStats();
super.getOrdinaryColumnStats();
}
} else {
getOrdinaryColumnStats();
super.getOrdinaryColumnStats();
}
}
private boolean isPartitionColumn() {
return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
}
// Get ordinary column stats. Ordinary column means not partition column.
private void getOrdinaryColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());
params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
Pair<Double, Long> sampleInfo = getSampleInfo();
params.put("scaleFactor", String.valueOf(sampleInfo.first));
StringSubstitutor stringSubstitutor;
if (tableSample == null) {
// Do full analyze
LOG.debug("Will do full collection for column {}", col.getName());
sb.append(COLLECT_COL_STATISTICS);
} else {
// Do sample analyze
LOG.debug("Will do sample collection for column {}", col.getName());
boolean limitFlag = false;
boolean bucketFlag = false;
// If sample size is too large, use limit to control the sample size.
if (needLimit(sampleInfo.second, sampleInfo.first)) {
limitFlag = true;
long columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
double targetRows = (double) sampleInfo.second / columnSize;
// Estimate the new scaleFactor based on the schema.
if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
params.put("scaleFactor",
String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
}
}
// Single distribution column is not fit for DUJ1 estimator, use linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
}
LOG.info("Sample for column [{}]. Scale factor [{}], "
+ "limited [{}], is distribute column [{}]",
col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql);
}
// Collect the partition column stats through HMS metadata.
// Get all the partition values and calculate the stats based on the values.
private void getPartitionColumnStats() throws Exception {
Set<String> partitionNames = table.getPartitionNames();
Set<String> partitionNames = hmsExternalTable.getPartitionNames();
Set<String> ndvPartValues = Sets.newHashSet();
long numNulls = 0;
long dataSize = 0;
@@ -198,8 +108,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
}
// Estimate the row count. This value is inaccurate if the table stats is empty.
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
dataSize = dataSize * count / partitionNames.size();
numNulls = numNulls * count / partitionNames.size();
int ndv = ndvPartValues.size();
@@ -218,8 +129,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
// Collect the spark analyzed column stats through HMS metadata.
private void getHmsColumnStats() throws Exception {
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
Map<String, String> params = buildStatsParams("NULL");
Map<StatsType, String> statsParams = new HashMap<>();
@@ -229,7 +141,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
statsParams.put(StatsType.MAX_VALUE, "max");
statsParams.put(StatsType.AVG_SIZE, "avg_len");
if (table.fillColumnStatistics(info.colName, statsParams, params)) {
if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
throw new AnalysisException("some column stats not available");
}
@@ -283,126 +195,4 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
return value.compareTo(currentMax) > 0 ? value : currentMax;
}
private Map<String, String> buildStatsParams(String partId) {
Map<String, String> commonParams = new HashMap<>();
String id = StatisticsUtil.constructId(tbl.getId(), -1);
if (partId == null) {
commonParams.put("partId", "NULL");
} else {
id = StatisticsUtil.constructId(id, partId);
commonParams.put("partId", "\'" + partId + "\'");
}
commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
commonParams.put("id", id);
commonParams.put("catalogId", String.valueOf(catalog.getId()));
commonParams.put("dbId", String.valueOf(db.getId()));
commonParams.put("tblId", String.valueOf(tbl.getId()));
commonParams.put("indexId", "-1");
commonParams.put("idxId", "-1");
commonParams.put("colName", info.colName);
commonParams.put("colId", info.colName);
commonParams.put("catalogName", catalog.getName());
commonParams.put("dbName", db.getFullName());
commonParams.put("tblName", tbl.getName());
commonParams.put("sampleHints", getSampleHint());
commonParams.put("limit", "");
commonParams.put("scaleFactor", "1");
if (col != null) {
commonParams.put("type", col.getType().toString());
}
commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
return commonParams;
}
protected String getSampleHint() {
if (tableSample == null) {
return "";
}
if (tableSample.isPercent()) {
return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
} else {
return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
}
}
/**
* Get the pair of sample scale factor and the file size going to sample.
* While analyzing, the result of count, null count and data size need to
* multiply this scale factor to get more accurate result.
* @return Pair of sample scale factor and the file size going to sample.
*/
protected Pair<Double, Long> getSampleInfo() {
if (tableSample == null) {
return Pair.of(1.0, 0L);
}
long target;
// Get list of all files' size in this HMS table.
List<Long> chunkSizes = table.getChunkSizes();
Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
long total = 0;
// Calculate the total size of this HMS table.
for (long size : chunkSizes) {
total += size;
}
if (total == 0) {
return Pair.of(1.0, 0L);
}
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
// Calculate the actual sample size (cumulate).
long cumulate = 0;
for (long size : chunkSizes) {
cumulate += size;
if (cumulate >= target) {
break;
}
}
return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
}
@Override
protected void afterExecution() {
// Table level task doesn't need to sync any value to sync stats, it stores the value in metadata.
// Partition only task doesn't need to refresh cached.
if (isTableLevelTask || isPartitionOnly) {
return;
}
Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
}
/**
* If the size to sample is larger than LIMIT_SIZE (1GB)
* and is much larger (1.2*) than the size user want to sample,
* use limit to control the total sample size.
* @param sizeToRead The file size to sample.
* @param factor sizeToRead * factor = Table total size.
* @return True if need to limit.
*/
protected boolean needLimit(long sizeToRead, double factor) {
long total = (long) (sizeToRead * factor);
long target;
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
} else {
int columnSize = 0;
for (Column column : table.getFullSchema()) {
columnSize += column.getDataType().getSlotSize();
}
target = columnSize * tableSample.getSampleValue();
}
if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
return true;
}
return false;
}
}
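
After the refactor, HMSAnalysisTask keeps only the HMS-specific paths (partition-value stats and spark-written HMS column stats) and falls back to the inherited SQL collection when those fail or when usingSqlForPartitionColumn is set. A compilable sketch of that override-with-fallback shape, with the Doris task hierarchy reduced to hypothetical stubs:

// Stubs standing in for the Doris task hierarchy; only the control flow
// mirrors the commit.
class ExternalAnalysisTaskSketch {
    protected void getOrdinaryColumnStats() throws Exception {
        // generic SQL-based collection (full, linear, or DUJ1 sampling)
    }
}

class HmsAnalysisTaskSketch extends ExternalAnalysisTaskSketch {
    boolean usingSqlForPartitionColumn; // from AnalysisInfo in the real code
    boolean partitionColumn;            // isPartitionColumn() in the real code

    @Override
    protected void getOrdinaryColumnStats() throws Exception {
        if (!usingSqlForPartitionColumn) {
            try {
                if (partitionColumn) {
                    getPartitionColumnStats(); // from HMS partition values
                } else {
                    getHmsColumnStats();       // from spark-written HMS stats
                }
                return;
            } catch (Exception e) {
                // metadata path failed: fall through to the SQL collection
            }
        }
        super.getOrdinaryColumnStats();
    }

    private void getPartitionColumnStats() throws Exception { /* HMS metadata */ }

    private void getHmsColumnStats() throws Exception { /* HMS metadata */ }
}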

View File

@@ -84,10 +84,12 @@ import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
@@ -735,8 +737,12 @@ public class StatisticsUtil {
columnStatisticBuilder.setDataSize(0);
columnStatisticBuilder.setAvgSizeByte(0);
columnStatisticBuilder.setNumNulls(0);
for (FileScanTask task : tableScan.planFiles()) {
processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask task : fileScanTasks) {
processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
}
} catch (IOException e) {
LOG.warn("Error to close FileScanTask.", e);
}
if (columnStatisticBuilder.getCount() > 0) {
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize()
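
planFiles() returns a CloseableIterable backed by open manifest readers, so the commit switches the loop to try-with-resources rather than leaking them. A minimal sketch of the same pattern against the public Iceberg API, assuming the Table comes from the catalog as in getIcebergTable above:

import java.io.IOException;

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class ScanFiles {
    // Iterate the scan tasks and make sure the backing readers are closed.
    static long totalRecords(Table table) {
        long records = 0;
        try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
            for (FileScanTask task : tasks) {
                records += task.file().recordCount(); // per-data-file record count
            }
        } catch (IOException e) {
            // Closing failed; the totals gathered so far are still usable.
        }
        return records;
    }
}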

View File

@@ -252,7 +252,7 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(true);
task.info = analysisInfoBuilder.build();
task.getTableColumnStats();
task.getOrdinaryColumnStats();
}
@@ -309,6 +309,6 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(false);
task.info = analysisInfoBuilder.build();
task.getTableColumnStats();
task.getOrdinaryColumnStats();
}
}