[Improvement](statistics)Show column stats even when error occurred (#23703)

Previously, show column stats ignored any column whose stats hit an error.
In this PR, when the min or max value fails to deserialize, show column stats uses N/A as the min or max value and still shows the remaining stats (count, null_count, ndv, and so on).
This commit is contained in:
Jibing-Li
2023-09-01 10:57:37 +08:00
committed by GitHub
parent b93a1a83a5
commit 9a7e8b298a
2 changed files with 63 additions and 5 deletions

View File

@ -21,6 +21,7 @@ import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PartitionInfo;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.base.Preconditions;
@ -168,21 +169,31 @@ public class ColumnStatistic {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
try {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} min value {}.", col, min, e);
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
}
} else {
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
try {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
} catch (AnalysisException e) {
LOG.warn("Failed to deserialize column {} max value {}.", col, max, e);
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
}
} else {
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
}
columnStatisticBuilder.setUpdatedTime(row.get(13));
return columnStatisticBuilder.build();
} catch (Exception e) {
LOG.warn("Failed to deserialize column statistics, column not exists", e);
LOG.warn("Failed to deserialize column statistics.", e);
return ColumnStatistic.UNKNOWN;
}
}

View File

@ -243,6 +243,53 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
sql """drop stats statistics"""
result = sql """show column cached stats statistics"""
assertTrue(result.size() == 0)
sql """use multi_catalog"""
sql """analyze table logs1_parquet (log_time) with sync"""
def ctlId
def dbId
def tblId
result = sql """show proc '/catalogs'"""
for (int i = 0; i < result.size(); i++) {
if (result[i][1] == catalog_name) {
ctlId = result[i][0]
}
}
result = sql """show proc '/catalogs/$ctlId'"""
for (int i = 0; i < result.size(); i++) {
if (result[i][1] == 'multi_catalog') {
dbId = result[i][0]
}
}
result = sql """show proc '/catalogs/$ctlId/$dbId'"""
for (int i = 0; i < result.size(); i++) {
if (result[i][1] == 'logs1_parquet') {
tblId = result[i][0]
}
}
result = sql """select * from internal.__internal_schema.column_statistics where id = '${tblId}--1-log_time'"""
assertTrue(result.size() == 1)
def id = result[0][0]
def catalog_id = result[0][1]
def db_id = result[0][2]
def tbl_id = result[0][3]
def idx_id = result[0][4]
def col_id = result[0][5]
def count = result[0][7]
def ndv = result[0][8]
def null_count = result[0][9]
def data_size_in_bytes = result[0][12]
def update_time = result[0][13]
sql """insert into internal.__internal_schema.column_statistics values ('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count, $ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
result = sql """show column stats logs1_parquet (log_time)"""
assertTrue(result.size() == 1)
assertTrue(result[0][6] == "N/A")
assertTrue(result[0][7] == "N/A")
sql """drop catalog ${catalog_name}"""
}
}