[fix](stats)min and max return NaN when table is empty (#27862)

Fix the bugs in analyzing empty tables and in handling null min/max values:
1. Skip the analyze task for empty tables in sample analyze tasks as well. (Full analyze tasks already skipped it.)
2. Check that the number of sample rows is not 0 before calculating the scale factor.
3. Remove the ' quotes in the SQL templates after removing base64 encoding for min/max values.
This commit is contained in:
Jibing-Li
2023-12-01 17:00:56 +08:00
committed by GitHub
parent 18338a33b6
commit 26e81b6573
7 changed files with 56 additions and 32 deletions

View File

@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask {
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+ "'${min}' AS `min`, "
+ "'${max}' AS `max`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask {
+ "${row_count} AS `row_count`, "
+ "${ndv} AS `ndv`, "
+ "${null_count} AS `null_count`, "
+ "'${min}' AS `min`, "
+ "'${max}' AS `max`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
+ "${data_size} AS `data_size`, "
+ "NOW() ";
@ -311,7 +311,7 @@ public abstract class BaseAnalysisTask {
this.job = job;
}
protected void runQuery(String sql, boolean needEncode) {
protected void runQuery(String sql) {
long startTime = System.currentTimeMillis();
try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);

View File

@ -159,7 +159,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql, true);
runQuery(sql);
}
// Collect the partition column stats through HMS metadata.
@ -201,12 +201,12 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
params.put("row_count", String.valueOf(count));
params.put("ndv", String.valueOf(ndv));
params.put("null_count", String.valueOf(numNulls));
params.put("min", min);
params.put("max", max);
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
params.put("data_size", String.valueOf(dataSize));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
runQuery(sql, true);
runQuery(sql);
}
private String updateMinValue(String currentMin, String value) {
@ -313,6 +313,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
for (long size : chunkSizes) {
total += size;
}
if (total == 0) {
return Pair.of(1.0, 0L);
}
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;

View File

@ -110,7 +110,7 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
params.put("dataSizeFunction", getDataSizeFunction(col, false));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
runQuery(sql, true);
runQuery(sql);
}
private Map<String, String> buildTableStatsParams(String partId) {

View File

@ -59,7 +59,13 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
public void doExecute() throws Exception {
Set<String> partitionNames = info.colToPartitions.get(info.colName);
if (partitionNames.isEmpty()) {
LOG.debug("Skip empty empty partition task for column {} in {}.{}.{}",
info.catalogId, info.dbId, info.tblId, info.colName);
job.appendBuf(this, Collections.emptyList());
return;
}
if (tableSample != null) {
doSample();
} else {
@ -113,24 +119,25 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("scaleFactor", String.valueOf(scaleFactor));
params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr));
params.put("ndvFunction", getNdvFunction(String.valueOf(rowCount)));
params.put("min", min);
params.put("max", max);
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
params.put("rowCount", String.valueOf(rowCount));
params.put("type", col.getType().toString());
params.put("limit", "");
if (needLimit()) {
// If the tablets to be sampled are too large, use limit to control the rows to read, and re-calculate
// the scaleFactor.
limitFlag = true;
rowsToSample = Math.min(getSampleRows(), pair.second);
params.put("limit", "limit " + rowsToSample);
params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample));
// Empty table doesn't need to limit.
if (rowsToSample > 0) {
limitFlag = true;
params.put("limit", "limit " + rowsToSample);
params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample));
}
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql;
if (useLinearAnalyzeTemplate()) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(rowCount));
@ -148,7 +155,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
runQuery(sql, false);
runQuery(sql);
}
}
@ -169,11 +176,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
*/
protected void doFull() throws Exception {
LOG.debug("Will do full collection for column {}", col.getName());
Set<String> partitionNames = info.colToPartitions.get(info.colName);
if (partitionNames.isEmpty()) {
job.appendBuf(this, Collections.emptyList());
return;
}
Map<String, String> params = new HashMap<>();
params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
@ -189,7 +191,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("tblName", String.valueOf(tbl.getName()));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String collectColStats = stringSubstitutor.replace(COLLECT_COL_STATISTICS);
runQuery(collectColStats, true);
runQuery(collectColStats);
}
// Get sample tablets id and scale up scaleFactor