[feature](statistics)support statistics for iceberg/paimon/hudi table (#29868)
@@ -23,7 +23,10 @@ import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.iceberg.IcebergExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TIcebergTable;
@@ -149,4 +152,10 @@ public class IcebergExternalTable extends ExternalTable {
                () -> StatisticsUtil.getIcebergColumnStats(colName,
                        ((IcebergExternalCatalog) catalog).getIcebergTable(dbName, name)));
    }

    @Override
    public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
        makeSureInitialized();
        return new ExternalAnalysisTask(info);
    }
}

@@ -21,6 +21,9 @@ import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.paimon.PaimonExternalCatalog;
import org.apache.doris.statistics.AnalysisInfo;
import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TTableDescriptor;
import org.apache.doris.thrift.TTableType;
@@ -154,4 +157,10 @@ public class PaimonExternalTable extends ExternalTable {
                        + getPaimonCatalogType());
        }
    }

    @Override
    public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
        makeSureInitialized();
        return new ExternalAnalysisTask(info);
    }
}

@@ -27,6 +27,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.log4j.Logger;

import java.util.ArrayList;
import java.util.Collections;
@@ -37,6 +38,7 @@ import java.util.List;
 * show all catalogs' info
 */
public class CatalogsProcDir implements ProcDirInterface {
    private static final Logger LOG = Logger.getLogger(CatalogsProcDir.class);
    public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
            .add("CatalogIds").add("CatalogName").add("DatabaseNum").add("LastUpdateTime")
            .build();
@@ -90,7 +92,13 @@ public class CatalogsProcDir implements ProcDirInterface {
            List<Comparable> catalogInfo = Lists.newArrayList();
            catalogInfo.add(catalog.getId());
            catalogInfo.add(catalog.getName());
            catalogInfo.add(catalog.getDbNames().size());
            int size = -1;
            try {
                size = catalog.getDbNames().size();
            } catch (Exception e) {
                LOG.warn("failed to get database: ", e);
            }
            catalogInfo.add(size);
            catalogInfo.add(TimeUtils.longToTimeString(catalog.getLastUpdateTime()));
            catalogInfos.add(catalogInfo);
        }

@@ -0,0 +1,263 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.statistics;

import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.statistics.util.StatisticsUtil;

import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

public class ExternalAnalysisTask extends BaseAnalysisTask {
    private static final Logger LOG = LogManager.getLogger(ExternalAnalysisTask.class);

    private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
    private boolean isTableLevelTask;
    private boolean isPartitionOnly;
    private ExternalTable table;

    // For test
    public ExternalAnalysisTask() {
    }

    public ExternalAnalysisTask(AnalysisInfo info) {
        super(info);
        isTableLevelTask = info.externalTableLevelTask;
        isPartitionOnly = info.partitionOnly;
        table = (ExternalTable) tbl;
    }

    public void doExecute() throws Exception {
        if (isTableLevelTask) {
            getTableStats();
        } else {
            getOrdinaryColumnStats();
        }
    }

    // For test
    protected void setTable(ExternalTable table) {
        this.table = table;
    }

    /**
     * Get table row count
     */
    private void getTableStats() {
        Map<String, String> params = buildStatsParams(null);
        List<ResultRow> columnResult =
                StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
                        .replace(ANALYZE_TABLE_COUNT_TEMPLATE));
        String rowCount = columnResult.get(0).get(0);
        Env.getCurrentEnv().getAnalysisManager()
                .updateTableStatsStatus(
                        new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
        job.rowCountDone(this);
    }

    // Get ordinary column stats
    protected void getOrdinaryColumnStats() throws Exception {
        StringBuilder sb = new StringBuilder();
        Map<String, String> params = buildStatsParams("NULL");
        params.put("min", getMinFunction());
        params.put("max", getMaxFunction());
        params.put("dataSizeFunction", getDataSizeFunction(col, false));
        Pair<Double, Long> sampleInfo = getSampleInfo();
        params.put("scaleFactor", String.valueOf(sampleInfo.first));
        StringSubstitutor stringSubstitutor;
        if (tableSample == null) {
            // Do full analyze
            LOG.debug("Will do full collection for column {}", col.getName());
            sb.append(COLLECT_COL_STATISTICS);
        } else {
            // Do sample analyze
            LOG.debug("Will do sample collection for column {}", col.getName());
            boolean limitFlag = false;
            boolean bucketFlag = false;
            // If sample size is too large, use limit to control the sample size.
            if (needLimit(sampleInfo.second, sampleInfo.first)) {
                limitFlag = true;
                long columnSize = 0;
                for (Column column : table.getFullSchema()) {
                    columnSize += column.getDataType().getSlotSize();
                }
                double targetRows = (double) sampleInfo.second / columnSize;
                // Estimate the new scaleFactor based on the schema.
                if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
                    params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
                    params.put("scaleFactor",
                            String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
                }
            }
            // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
            Set<String> distributionColumns = tbl.getDistributionColumnNames();
            if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
                bucketFlag = true;
                sb.append(LINEAR_ANALYZE_TEMPLATE);
                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
                params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
            } else {
                sb.append(DUJ1_ANALYZE_TEMPLATE);
                params.put("dataSizeFunction", getDataSizeFunction(col, true));
                params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
                params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
            }
            LOG.info("Sample for column [{}]. Scale factor [{}], "
                    + "limited [{}], is distribute column [{}]",
                    col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
        }
        stringSubstitutor = new StringSubstitutor(params);
        String sql = stringSubstitutor.replace(sb.toString());
        runQuery(sql);
    }

    protected Map<String, String> buildStatsParams(String partId) {
        Map<String, String> commonParams = new HashMap<>();
        String id = StatisticsUtil.constructId(tbl.getId(), -1);
        if (partId == null) {
            commonParams.put("partId", "NULL");
        } else {
            id = StatisticsUtil.constructId(id, partId);
            commonParams.put("partId", "\'" + partId + "\'");
        }
        commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
        commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
        commonParams.put("id", id);
        commonParams.put("catalogId", String.valueOf(catalog.getId()));
        commonParams.put("dbId", String.valueOf(db.getId()));
        commonParams.put("tblId", String.valueOf(tbl.getId()));
        commonParams.put("indexId", "-1");
        commonParams.put("idxId", "-1");
        commonParams.put("colName", info.colName);
        commonParams.put("colId", info.colName);
        commonParams.put("catalogName", catalog.getName());
        commonParams.put("dbName", db.getFullName());
        commonParams.put("tblName", tbl.getName());
        commonParams.put("sampleHints", getSampleHint());
        commonParams.put("limit", "");
        commonParams.put("scaleFactor", "1");
        if (col != null) {
            commonParams.put("type", col.getType().toString());
        }
        commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
        return commonParams;
    }

    protected String getSampleHint() {
        if (tableSample == null) {
            return "";
        }
        if (tableSample.isPercent()) {
            return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
        } else {
            return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
        }
    }

    /**
     * Get the pair of sample scale factor and the file size going to sample.
     * While analyzing, the result of count, null count and data size need to
     * multiply this scale factor to get more accurate result.
     * @return Pair of sample scale factor and the file size going to sample.
     */
    protected Pair<Double, Long> getSampleInfo() {
        if (tableSample == null) {
            return Pair.of(1.0, 0L);
        }
        long target;
        // Get list of all files' size in this HMS table.
        List<Long> chunkSizes = table.getChunkSizes();
        Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
        long total = 0;
        // Calculate the total size of this HMS table.
        for (long size : chunkSizes) {
            total += size;
        }
        if (total == 0) {
            return Pair.of(1.0, 0L);
        }
        // Calculate the sample target size for percent and rows sample.
        if (tableSample.isPercent()) {
            target = total * tableSample.getSampleValue() / 100;
        } else {
            int columnSize = 0;
            for (Column column : table.getFullSchema()) {
                columnSize += column.getDataType().getSlotSize();
            }
            target = columnSize * tableSample.getSampleValue();
        }
        // Calculate the actual sample size (cumulate).
        long cumulate = 0;
        for (long size : chunkSizes) {
            cumulate += size;
            if (cumulate >= target) {
                break;
            }
        }
        return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
    }

    @Override
    protected void afterExecution() {
        // Table level task doesn't need to sync any value to sync stats, it stores the value in metadata.
        // Partition only task doesn't need to refresh cached.
        if (isTableLevelTask || isPartitionOnly) {
            return;
        }
        Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
    }

    /**
     * If the size to sample is larger than LIMIT_SIZE (1GB)
     * and is much larger (1.2*) than the size user want to sample,
     * use limit to control the total sample size.
     * @param sizeToRead The file size to sample.
     * @param factor sizeToRead * factor = Table total size.
     * @return True if need to limit.
     */
    protected boolean needLimit(long sizeToRead, double factor) {
        long total = (long) (sizeToRead * factor);
        long target;
        if (tableSample.isPercent()) {
            target = total * tableSample.getSampleValue() / 100;
        } else {
            int columnSize = 0;
            for (Column column : table.getFullSchema()) {
                columnSize += column.getDataType().getSlotSize();
            }
            target = columnSize * tableSample.getSampleValue();
        }
        if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
            return true;
        }
        return false;
    }
}

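How the row-count query above is assembled: buildStatsParams() fills a parameter map and Apache Commons Text's StringSubstitutor expands the ${...} placeholders in ANALYZE_TABLE_COUNT_TEMPLATE before the query is sent through StatisticsUtil.execStatisticQuery(). A minimal standalone sketch of that substitution step follows; the catalog, database, and table names are made-up placeholders, not values from this patch.

import org.apache.commons.text.StringSubstitutor;

import java.util.HashMap;
import java.util.Map;

public class RowCountTemplateDemo {
    // Same shape as ANALYZE_TABLE_COUNT_TEMPLATE in ExternalAnalysisTask above.
    private static final String TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";

    public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        // Hypothetical values; in Doris they come from buildStatsParams() and getSampleHint().
        params.put("scaleFactor", "1");
        params.put("catalogName", "iceberg_catalog");
        params.put("dbName", "demo_db");
        params.put("tblName", "demo_tbl");
        params.put("sampleHints", ""); // empty when no TABLESAMPLE clause is requested
        String sql = new StringSubstitutor(params).replace(TEMPLATE);
        // Prints: SELECT ROUND(COUNT(1) * 1) as rowCount FROM `iceberg_catalog`.`demo_db`.`demo_tbl`
        System.out.println(sql.trim());
    }
}
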
@@ -17,12 +17,10 @@

package org.apache.doris.statistics;

import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.hive.HiveMetaStoreCache;
import org.apache.doris.external.hive.util.HiveUtil;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -32,64 +30,36 @@ import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

public class HMSAnalysisTask extends BaseAnalysisTask {
public class HMSAnalysisTask extends ExternalAnalysisTask {
    private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class);
    private HMSExternalTable hmsExternalTable;

    private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
    private boolean isTableLevelTask;
    private boolean isPartitionOnly;
    private HMSExternalTable table;

    // for test
    public HMSAnalysisTask() {
    }

    public HMSAnalysisTask(AnalysisInfo info) {
        super(info);
        isTableLevelTask = info.externalTableLevelTask;
        isPartitionOnly = info.partitionOnly;
        table = (HMSExternalTable) tbl;
        hmsExternalTable = (HMSExternalTable) tbl;
    }

    public void doExecute() throws Exception {
        if (isTableLevelTask) {
            getTableStats();
        } else {
            getTableColumnStats();
        }
    private boolean isPartitionColumn() {
        return hmsExternalTable.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
    }

    // For test
    protected void setTable(HMSExternalTable table) {
        this.table = table;
        setTable((ExternalTable) table);
        this.hmsExternalTable = table;
    }

    /**
     * Get table row count
     */
    private void getTableStats() throws Exception {
        Map<String, String> params = buildStatsParams(null);
        List<ResultRow> columnResult =
                StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
                        .replace(ANALYZE_TABLE_COUNT_TEMPLATE));
        String rowCount = columnResult.get(0).get(0);
        Env.getCurrentEnv().getAnalysisManager()
                .updateTableStatsStatus(
                        new TableStatsMeta(Long.parseLong(rowCount), info, tbl));
        job.rowCountDone(this);
    }

    /**
     * Get column statistics and insert the result to __internal_schema.column_statistics
     */
    protected void getTableColumnStats() throws Exception {
    @Override
    protected void getOrdinaryColumnStats() throws Exception {
        if (!info.usingSqlForPartitionColumn) {
            try {
                if (isPartitionColumn()) {
@@ -102,77 +72,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
                                + "fallback to normal collection",
                        isPartitionColumn() ? "partition " : "", col.getName(), e);
                /* retry using sql way! */
                getOrdinaryColumnStats();
                super.getOrdinaryColumnStats();
            }
        } else {
            getOrdinaryColumnStats();
            super.getOrdinaryColumnStats();
        }
    }

    private boolean isPartitionColumn() {
        return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
    }

    // Get ordinary column stats. Ordinary column means not partition column.
    private void getOrdinaryColumnStats() throws Exception {
        StringBuilder sb = new StringBuilder();
        Map<String, String> params = buildStatsParams("NULL");
        params.put("min", getMinFunction());
        params.put("max", getMaxFunction());
        params.put("dataSizeFunction", getDataSizeFunction(col, false));
        Pair<Double, Long> sampleInfo = getSampleInfo();
        params.put("scaleFactor", String.valueOf(sampleInfo.first));
        StringSubstitutor stringSubstitutor;
        if (tableSample == null) {
            // Do full analyze
            LOG.debug("Will do full collection for column {}", col.getName());
            sb.append(COLLECT_COL_STATISTICS);
        } else {
            // Do sample analyze
            LOG.debug("Will do sample collection for column {}", col.getName());
            boolean limitFlag = false;
            boolean bucketFlag = false;
            // If sample size is too large, use limit to control the sample size.
            if (needLimit(sampleInfo.second, sampleInfo.first)) {
                limitFlag = true;
                long columnSize = 0;
                for (Column column : table.getFullSchema()) {
                    columnSize += column.getDataType().getSlotSize();
                }
                double targetRows = (double) sampleInfo.second / columnSize;
                // Estimate the new scaleFactor based on the schema.
                if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
                    params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
                    params.put("scaleFactor",
                            String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
                }
            }
            // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
            Set<String> distributionColumns = tbl.getDistributionColumnNames();
            if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
                bucketFlag = true;
                sb.append(LINEAR_ANALYZE_TEMPLATE);
                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
                params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
            } else {
                sb.append(DUJ1_ANALYZE_TEMPLATE);
                params.put("dataSizeFunction", getDataSizeFunction(col, true));
                params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
                params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
            }
            LOG.info("Sample for column [{}]. Scale factor [{}], "
                    + "limited [{}], is distribute column [{}]",
                    col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
        }
        stringSubstitutor = new StringSubstitutor(params);
        String sql = stringSubstitutor.replace(sb.toString());
        runQuery(sql);
    }

    // Collect the partition column stats through HMS metadata.
    // Get all the partition values and calculate the stats based on the values.
    private void getPartitionColumnStats() throws Exception {
        Set<String> partitionNames = table.getPartitionNames();
        Set<String> partitionNames = hmsExternalTable.getPartitionNames();
        Set<String> ndvPartValues = Sets.newHashSet();
        long numNulls = 0;
        long dataSize = 0;
@@ -198,8 +108,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
            }
        }
        // Estimate the row count. This value is inaccurate if the table stats is empty.
        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
                .findTableStatsStatus(hmsExternalTable.getId());
        long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
        dataSize = dataSize * count / partitionNames.size();
        numNulls = numNulls * count / partitionNames.size();
        int ndv = ndvPartValues.size();
@@ -218,8 +129,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {

    // Collect the spark analyzed column stats through HMS metadata.
    private void getHmsColumnStats() throws Exception {
        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
                .findTableStatsStatus(hmsExternalTable.getId());
        long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;

        Map<String, String> params = buildStatsParams("NULL");
        Map<StatsType, String> statsParams = new HashMap<>();
@@ -229,7 +141,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
        statsParams.put(StatsType.MAX_VALUE, "max");
        statsParams.put(StatsType.AVG_SIZE, "avg_len");

        if (table.fillColumnStatistics(info.colName, statsParams, params)) {
        if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
            throw new AnalysisException("some column stats not available");
        }

@@ -283,126 +195,4 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
        }
        return value.compareTo(currentMax) > 0 ? value : currentMax;
    }

    private Map<String, String> buildStatsParams(String partId) {
        Map<String, String> commonParams = new HashMap<>();
        String id = StatisticsUtil.constructId(tbl.getId(), -1);
        if (partId == null) {
            commonParams.put("partId", "NULL");
        } else {
            id = StatisticsUtil.constructId(id, partId);
            commonParams.put("partId", "\'" + partId + "\'");
        }
        commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
        commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
        commonParams.put("id", id);
        commonParams.put("catalogId", String.valueOf(catalog.getId()));
        commonParams.put("dbId", String.valueOf(db.getId()));
        commonParams.put("tblId", String.valueOf(tbl.getId()));
        commonParams.put("indexId", "-1");
        commonParams.put("idxId", "-1");
        commonParams.put("colName", info.colName);
        commonParams.put("colId", info.colName);
        commonParams.put("catalogName", catalog.getName());
        commonParams.put("dbName", db.getFullName());
        commonParams.put("tblName", tbl.getName());
        commonParams.put("sampleHints", getSampleHint());
        commonParams.put("limit", "");
        commonParams.put("scaleFactor", "1");
        if (col != null) {
            commonParams.put("type", col.getType().toString());
        }
        commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
        return commonParams;
    }

    protected String getSampleHint() {
        if (tableSample == null) {
            return "";
        }
        if (tableSample.isPercent()) {
            return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
        } else {
            return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
        }
    }

    /**
     * Get the pair of sample scale factor and the file size going to sample.
     * While analyzing, the result of count, null count and data size need to
     * multiply this scale factor to get more accurate result.
     * @return Pair of sample scale factor and the file size going to sample.
     */
    protected Pair<Double, Long> getSampleInfo() {
        if (tableSample == null) {
            return Pair.of(1.0, 0L);
        }
        long target;
        // Get list of all files' size in this HMS table.
        List<Long> chunkSizes = table.getChunkSizes();
        Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
        long total = 0;
        // Calculate the total size of this HMS table.
        for (long size : chunkSizes) {
            total += size;
        }
        if (total == 0) {
            return Pair.of(1.0, 0L);
        }
        // Calculate the sample target size for percent and rows sample.
        if (tableSample.isPercent()) {
            target = total * tableSample.getSampleValue() / 100;
        } else {
            int columnSize = 0;
            for (Column column : table.getFullSchema()) {
                columnSize += column.getDataType().getSlotSize();
            }
            target = columnSize * tableSample.getSampleValue();
        }
        // Calculate the actual sample size (cumulate).
        long cumulate = 0;
        for (long size : chunkSizes) {
            cumulate += size;
            if (cumulate >= target) {
                break;
            }
        }
        return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
    }

    @Override
    protected void afterExecution() {
        // Table level task doesn't need to sync any value to sync stats, it stores the value in metadata.
        // Partition only task doesn't need to refresh cached.
        if (isTableLevelTask || isPartitionOnly) {
            return;
        }
        Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
    }

    /**
     * If the size to sample is larger than LIMIT_SIZE (1GB)
     * and is much larger (1.2*) than the size user want to sample,
     * use limit to control the total sample size.
     * @param sizeToRead The file size to sample.
     * @param factor sizeToRead * factor = Table total size.
     * @return True if need to limit.
     */
    protected boolean needLimit(long sizeToRead, double factor) {
        long total = (long) (sizeToRead * factor);
        long target;
        if (tableSample.isPercent()) {
            target = total * tableSample.getSampleValue() / 100;
        } else {
            int columnSize = 0;
            for (Column column : table.getFullSchema()) {
                columnSize += column.getDataType().getSlotSize();
            }
            target = columnSize * tableSample.getSampleValue();
        }
        if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
            return true;
        }
        return false;
    }
}

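With the hunks above, HMSAnalysisTask drops its private copies of buildStatsParams, getSampleHint, getSampleInfo, and needLimit and inherits them from ExternalAnalysisTask. The sampling math those methods implement is: shuffle the table's file sizes with a seeded Random, read whole files until the sample target is covered, then scale the sampled counts by total/cumulate. A minimal sketch of that computation with made-up numbers (assuming a 10 PERCENT sample; none of these values come from the patch):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;

public class SampleScaleFactorDemo {
    public static void main(String[] args) {
        // Hypothetical data file sizes in bytes for an external table.
        List<Long> chunkSizes = Arrays.asList(400L << 20, 300L << 20, 200L << 20, 100L << 20);
        Collections.shuffle(chunkSizes, new Random(42)); // seeded, like tableSample.getSeek()
        long total = chunkSizes.stream().mapToLong(Long::longValue).sum();

        long target = total * 10 / 100; // TABLESAMPLE(10 PERCENT)
        long cumulate = 0;
        for (long size : chunkSizes) { // whole files are read until the target is covered
            cumulate += size;
            if (cumulate >= target) {
                break;
            }
        }
        double scaleFactor = Math.max((double) total / cumulate, 1);
        // Sampled COUNT / null count / data size are multiplied by scaleFactor
        // to approximate full-table values, mirroring getSampleInfo().
        System.out.printf("total=%d, cumulate=%d, scaleFactor=%.2f%n", total, cumulate, scaleFactor);
    }
}
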
@@ -84,10 +84,12 @@ import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
@@ -735,8 +737,12 @@ public class StatisticsUtil {
        columnStatisticBuilder.setDataSize(0);
        columnStatisticBuilder.setAvgSizeByte(0);
        columnStatisticBuilder.setNumNulls(0);
        for (FileScanTask task : tableScan.planFiles()) {
            processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
        try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
            for (FileScanTask task : fileScanTasks) {
                processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
            }
        } catch (IOException e) {
            LOG.warn("Error to close FileScanTask.", e);
        }
        if (columnStatisticBuilder.getCount() > 0) {
            columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize()

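The StatisticsUtil hunk above switches the Iceberg scan to try-with-resources because TableScan.planFiles() returns an org.apache.iceberg.io.CloseableIterable whose underlying manifest readers must be closed. A minimal sketch of the same pattern against the public Iceberg API (the table handle and the record-count aggregation are placeholders for illustration, not code from this patch):

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

import java.io.IOException;

public class PlanFilesDemo {
    // Sums the record count of every data file the scan would read.
    static long plannedRecordCount(Table icebergTable) {
        TableScan scan = icebergTable.newScan();
        long records = 0;
        // try-with-resources closes the iterable (and its manifest readers) even on failure.
        try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
            for (FileScanTask task : tasks) {
                records += task.file().recordCount();
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to close file scan tasks", e);
        }
        return records;
    }
}
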
@@ -252,7 +252,7 @@ public class HMSAnalysisTaskTest {
        analysisInfoBuilder.setUsingSqlForPartitionColumn(true);
        task.info = analysisInfoBuilder.build();

        task.getTableColumnStats();
        task.getOrdinaryColumnStats();
    }


@@ -309,6 +309,6 @@ public class HMSAnalysisTaskTest {
        analysisInfoBuilder.setUsingSqlForPartitionColumn(false);
        task.info = analysisInfoBuilder.build();

        task.getTableColumnStats();
        task.getOrdinaryColumnStats();
    }
}