[fix](statistics) Fix HMS external table row count retrieval bug during analyze (#31557)

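Rename getOrdinaryColumnStats to getColumnStats, fall back to fetchRowCount when the cached row count is 0, and fix the inverted fillColumnStatistics check in getHmsColumnStats.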
This commit is contained in:
Jibing-Li
2024-02-29 11:22:58 +08:00
committed by yiguolei
parent 17359d59a3
commit 6ef3455786
3 changed files with 21 additions and 19 deletions

View File

@ -59,7 +59,7 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
if (isTableLevelTask) {
getTableStats();
} else {
getOrdinaryColumnStats();
getColumnStats();
}
}
@ -83,8 +83,8 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
job.rowCountDone(this);
}
// Get ordinary column stats
protected void getOrdinaryColumnStats() throws Exception {
// Get column stats
protected void getColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());

View File

@ -17,7 +17,6 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.datasource.ExternalTable;
import org.apache.doris.datasource.hive.HMSExternalTable;
@ -59,8 +58,10 @@ public class HMSAnalysisTask extends ExternalAnalysisTask {
@Override
protected void getOrdinaryColumnStats() throws Exception {
if (!info.usingSqlForPartitionColumn) {
protected void getColumnStats() throws Exception {
if (info.usingSqlForPartitionColumn) {
super.getColumnStats();
} else {
try {
if (isPartitionColumn()) {
getPartitionColumnStats();
@ -72,10 +73,8 @@ public class HMSAnalysisTask extends ExternalAnalysisTask {
+ "fallback to normal collection",
isPartitionColumn() ? "partition " : "", col.getName(), e);
/* retry using sql way! */
super.getOrdinaryColumnStats();
super.getColumnStats();
}
} else {
super.getOrdinaryColumnStats();
}
}
@ -107,10 +106,11 @@ public class HMSAnalysisTask extends ExternalAnalysisTask {
}
}
}
// Estimate the row count. This value is inaccurate if the table stats is empty.
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.getRowCount() : tableStatsStatus.rowCount;
// getRowCount may return 0 if the cache is empty; in that case, call fetchRowCount.
long count = hmsExternalTable.getRowCount();
if (count == 0) {
count = hmsExternalTable.fetchRowCount();
}
dataSize = dataSize * count / partitionNames.size();
numNulls = numNulls * count / partitionNames.size();
int ndv = ndvPartValues.size();
@ -129,9 +129,11 @@ public class HMSAnalysisTask extends ExternalAnalysisTask {
// Collect the spark analyzed column stats through HMS metadata.
private void getHmsColumnStats() throws Exception {
TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
.findTableStatsStatus(hmsExternalTable.getId());
long count = tableStatsStatus == null ? hmsExternalTable.getRowCount() : tableStatsStatus.rowCount;
// getRowCount may return 0 if the cache is empty; in that case, call fetchRowCount.
long count = hmsExternalTable.getRowCount();
if (count == 0) {
count = hmsExternalTable.fetchRowCount();
}
Map<String, String> params = buildStatsParams("NULL");
Map<StatsType, String> statsParams = new HashMap<>();
@ -141,7 +143,7 @@ public class HMSAnalysisTask extends ExternalAnalysisTask {
statsParams.put(StatsType.MAX_VALUE, "max");
statsParams.put(StatsType.AVG_SIZE, "avg_len");
if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
if (!hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
throw new AnalysisException("some column stats not available");
}