[fix](stats) Store max/min by base64

This commit is contained in:
AKIRA
2023-11-01 14:31:35 +08:00
committed by GitHub
parent 7ba4f91258
commit 268c69971d
9 changed files with 64 additions and 20 deletions

View File

@ -78,7 +78,8 @@ public abstract class BaseAnalysisTask {
protected static final String INSERT_COL_STATISTICS = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, "
+ " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n"
+ " ndv, null_count,"
+ " to_base64(CAST(min AS string)), to_base64(CAST(max AS string)), data_size, update_time\n"
+ " FROM \n"
+ " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+ " ${catalogId} AS catalog_id, "
@ -89,8 +90,8 @@ public abstract class BaseAnalysisTask {
+ " NULL AS part_id, "
+ " SUM(count) AS row_count, \n"
+ " SUM(null_count) AS null_count, "
+ " MIN(CAST(min AS ${type})) AS min, "
+ " MAX(CAST(max AS ${type})) AS max, "
+ " MIN(CAST(from_base64(min) AS ${type})) AS min, "
+ " MAX(CAST(from_base64(max) AS ${type})) AS max, "
+ " SUM(data_size_in_bytes) AS data_size, "
+ " NOW() AS update_time \n"
+ " FROM ${internalDB}.${columnStatTbl}"
@ -114,8 +115,8 @@ public abstract class BaseAnalysisTask {
+ "${row_count} AS row_count, "
+ "${ndv} AS ndv, "
+ "${null_count} AS null_count, "
+ "'${min}' AS min, "
+ "'${max}' AS max, "
+ "to_base64('${min}') AS min, "
+ "to_base64('${max}') AS max, "
+ "${data_size} AS data_size, "
+ "NOW() ";
@ -241,7 +242,7 @@ public abstract class BaseAnalysisTask {
// The min value is not accurate when sampling, so it is set to NULL to keep the optimizer from generating a bad plan.
protected String getMinFunction() {
if (tableSample == null) {
return "MIN(`${colName}`) ";
return "MIN(CAST(min AS ${type}))";
} else {
return "NULL ";
}
@ -250,7 +251,7 @@ public abstract class BaseAnalysisTask {
// The max value is not accurate when sampling, so it is set to NULL to keep the optimizer from generating a bad plan.
protected String getMaxFunction() {
if (tableSample == null) {
return "MAX(`${colName}`) ";
return "MAX(CAST(min AS ${type}))";
} else {
return "NULL ";
}

View File

@ -19,6 +19,8 @@ package org.apache.doris.statistics;
import org.apache.doris.statistics.util.StatisticsUtil;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.StringJoiner;
/**
@ -73,8 +75,10 @@ public class ColStatsData {
sj.add(String.valueOf(count));
sj.add(String.valueOf(ndv));
sj.add(String.valueOf(nullCount));
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(minLit)));
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(maxLit)));
sj.add(minLit == null ? "NULL" :
"'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'");
sj.add(maxLit == null ? "NULL" :
"'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'");
sj.add(String.valueOf(dataSizeInBytes));
sj.add(StatisticsUtil.quote(updateTime));
return sj.toString();

View File

@ -30,6 +30,8 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -172,6 +174,9 @@ public class ColumnStatistic {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
min = new String(Base64.getDecoder().decode(min),
StandardCharsets.UTF_8);
try {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
@ -183,6 +188,10 @@ public class ColumnStatistic {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
max = new String(Base64.getDecoder().decode(max),
StandardCharsets.UTF_8);
try {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));

View File

@ -64,8 +64,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
+ "${minFunction} AS min, "
+ "${maxFunction} AS max, "
+ "to_base64(${minFunction}) AS min, "
+ "to_base64(${maxFunction}) AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
@ -81,8 +81,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "COUNT(1) AS row_count, "
+ "NDV(`${colName}`) AS ndv, "
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "to_base64(MIN(`${colName}`)) AS min, "
+ "to_base64(MAX(`${colName}`)) AS max, "
+ "${dataSizeFunction} AS data_size, "
+ "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where ";

View File

@ -49,8 +49,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
+ "COUNT(1) AS row_count, "
+ "NDV(`${colName}`) AS ndv, "
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "to_base64(MIN(`${colName}`)) AS min, "
+ "to_base64(MAX(`${colName}`)) AS max, "
+ "${dataSizeFunction} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}`";

View File

@ -37,6 +37,7 @@ import org.apache.commons.text.StringSubstitutor;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
@ -121,8 +122,9 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
List<Long> tabletIds = pair.first;
double scaleFactor = (double) tbl.getRowCount() / (double) pair.second;
// This might happen if the row count in the FE metadata hasn't been updated yet (the division above then yields Infinity or NaN).
if (Double.isInfinite(scaleFactor)) {
if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) {
scaleFactor = 1;
tabletIds = Collections.emptyList();
}
String tabletStr = tabletIds.stream()
.map(Object::toString)