[fix](stats) Store max/min by base64
This commit is contained in:
@ -78,7 +78,8 @@ public abstract class BaseAnalysisTask {
|
||||
protected static final String INSERT_COL_STATISTICS = "INSERT INTO "
|
||||
+ "${internalDB}.${columnStatTbl}"
|
||||
+ " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, "
|
||||
+ " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n"
|
||||
+ " ndv, null_count,"
|
||||
+ " to_base64(CAST(min AS string)), to_base64(CAST(max AS string)), data_size, update_time\n"
|
||||
+ " FROM \n"
|
||||
+ " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
|
||||
+ " ${catalogId} AS catalog_id, "
|
||||
@ -89,8 +90,8 @@ public abstract class BaseAnalysisTask {
|
||||
+ " NULL AS part_id, "
|
||||
+ " SUM(count) AS row_count, \n"
|
||||
+ " SUM(null_count) AS null_count, "
|
||||
+ " MIN(CAST(min AS ${type})) AS min, "
|
||||
+ " MAX(CAST(max AS ${type})) AS max, "
|
||||
+ " MIN(CAST(from_base64(min) AS ${type})) AS min, "
|
||||
+ " MAX(CAST(from_base64(max) AS ${type})) AS max, "
|
||||
+ " SUM(data_size_in_bytes) AS data_size, "
|
||||
+ " NOW() AS update_time \n"
|
||||
+ " FROM ${internalDB}.${columnStatTbl}"
|
||||
@ -114,8 +115,8 @@ public abstract class BaseAnalysisTask {
|
||||
+ "${row_count} AS row_count, "
|
||||
+ "${ndv} AS ndv, "
|
||||
+ "${null_count} AS null_count, "
|
||||
+ "'${min}' AS min, "
|
||||
+ "'${max}' AS max, "
|
||||
+ "to_base64('${min}') AS min, "
|
||||
+ "to_base64('${max}') AS max, "
|
||||
+ "${data_size} AS data_size, "
|
||||
+ "NOW() ";
|
||||
|
||||
@ -241,7 +242,7 @@ public abstract class BaseAnalysisTask {
|
||||
// The min value is not accurate when sampling, so set it to NULL to keep the optimizer from generating a bad plan.
|
||||
protected String getMinFunction() {
|
||||
if (tableSample == null) {
|
||||
return "MIN(`${colName}`) ";
|
||||
return "MIN(CAST(min AS ${type}))";
|
||||
} else {
|
||||
return "NULL ";
|
||||
}
|
||||
@ -250,7 +251,7 @@ public abstract class BaseAnalysisTask {
|
||||
// The max value is not accurate when sampling, so set it to NULL to keep the optimizer from generating a bad plan.
|
||||
protected String getMaxFunction() {
|
||||
if (tableSample == null) {
|
||||
return "MAX(`${colName}`) ";
|
||||
return "MAX(CAST(min AS ${type}))";
|
||||
} else {
|
||||
return "NULL ";
|
||||
}
|
||||
|
||||
@ -19,6 +19,8 @@ package org.apache.doris.statistics;
|
||||
|
||||
import org.apache.doris.statistics.util.StatisticsUtil;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Base64;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
/**
|
||||
@ -73,8 +75,10 @@ public class ColStatsData {
|
||||
sj.add(String.valueOf(count));
|
||||
sj.add(String.valueOf(ndv));
|
||||
sj.add(String.valueOf(nullCount));
|
||||
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(minLit)));
|
||||
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(maxLit)));
|
||||
sj.add(minLit == null ? "NULL" :
|
||||
"'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'");
|
||||
sj.add(maxLit == null ? "NULL" :
|
||||
"'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'");
|
||||
sj.add(String.valueOf(dataSizeInBytes));
|
||||
sj.add(StatisticsUtil.quote(updateTime));
|
||||
return sj.toString();
|
||||
|
||||
@ -30,6 +30,8 @@ import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Base64;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -172,6 +174,9 @@ public class ColumnStatistic {
|
||||
String min = row.get(10);
|
||||
String max = row.get(11);
|
||||
if (min != null && !min.equalsIgnoreCase("NULL")) {
|
||||
min = new String(Base64.getDecoder().decode(min),
|
||||
StandardCharsets.UTF_8);
|
||||
|
||||
try {
|
||||
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
|
||||
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
|
||||
@ -183,6 +188,10 @@ public class ColumnStatistic {
|
||||
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
if (max != null && !max.equalsIgnoreCase("NULL")) {
|
||||
|
||||
max = new String(Base64.getDecoder().decode(max),
|
||||
StandardCharsets.UTF_8);
|
||||
|
||||
try {
|
||||
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
|
||||
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
|
||||
|
||||
@ -64,8 +64,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
|
||||
+ NDV_SAMPLE_TEMPLATE
|
||||
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
|
||||
+ "${minFunction} AS min, "
|
||||
+ "${maxFunction} AS max, "
|
||||
+ "to_base64(${minFunction}) AS min, "
|
||||
+ "to_base64(${maxFunction}) AS max, "
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
|
||||
+ "NOW() "
|
||||
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
|
||||
@ -81,8 +81,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
|
||||
+ "COUNT(1) AS row_count, "
|
||||
+ "NDV(`${colName}`) AS ndv, "
|
||||
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
+ "to_base64(MIN(`${colName}`)) AS min, "
|
||||
+ "to_base64(MAX(`${colName}`)) AS max, "
|
||||
+ "${dataSizeFunction} AS data_size, "
|
||||
+ "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where ";
|
||||
|
||||
|
||||
@ -49,8 +49,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
|
||||
+ "COUNT(1) AS row_count, "
|
||||
+ "NDV(`${colName}`) AS ndv, "
|
||||
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
|
||||
+ "MIN(`${colName}`) AS min, "
|
||||
+ "MAX(`${colName}`) AS max, "
|
||||
+ "to_base64(MIN(`${colName}`)) AS min, "
|
||||
+ "to_base64(MAX(`${colName}`)) AS max, "
|
||||
+ "${dataSizeFunction} AS data_size, "
|
||||
+ "NOW() "
|
||||
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}`";
|
||||
|
||||
@ -37,6 +37,7 @@ import org.apache.commons.text.StringSubstitutor;
|
||||
import java.security.SecureRandom;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -121,8 +122,9 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
List<Long> tabletIds = pair.first;
|
||||
double scaleFactor = (double) tbl.getRowCount() / (double) pair.second;
|
||||
// This might happen if the row count in the FE metadata hasn't been updated yet.
|
||||
if (Double.isInfinite(scaleFactor)) {
|
||||
if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) {
|
||||
scaleFactor = 1;
|
||||
tabletIds = Collections.emptyList();
|
||||
}
|
||||
String tabletStr = tabletIds.stream()
|
||||
.map(Object::toString)
|
||||
|
||||
Reference in New Issue
Block a user