[Improvement](statistics)Show column stats even when error occurred (#23703)
Previously, `show column stats` ignored any column whose statistics hit an error. With this PR, when the min or max value fails to deserialize, `show column stats` uses N/A as the min or max value and still shows the rest of the stats (count, null_count, ndv, and so on).
This commit is contained in:
@ -21,6 +21,7 @@ import org.apache.doris.analysis.LiteralExpr;
|
||||
import org.apache.doris.catalog.Column;
|
||||
import org.apache.doris.catalog.PartitionInfo;
|
||||
import org.apache.doris.catalog.Type;
|
||||
import org.apache.doris.common.AnalysisException;
|
||||
import org.apache.doris.statistics.util.StatisticsUtil;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
@ -168,21 +169,31 @@ public class ColumnStatistic {
|
||||
String min = row.get(10);
|
||||
String max = row.get(11);
|
||||
if (min != null && !min.equalsIgnoreCase("NULL")) {
|
||||
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
|
||||
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
|
||||
try {
|
||||
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
|
||||
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
|
||||
} catch (AnalysisException e) {
|
||||
LOG.warn("Failed to deserialize column {} min value {}.", col, min, e);
|
||||
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
|
||||
}
|
||||
} else {
|
||||
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
|
||||
}
|
||||
if (max != null && !max.equalsIgnoreCase("NULL")) {
|
||||
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
|
||||
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
|
||||
try {
|
||||
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
|
||||
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
|
||||
} catch (AnalysisException e) {
|
||||
LOG.warn("Failed to deserialize column {} max value {}.", col, max, e);
|
||||
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
|
||||
}
|
||||
} else {
|
||||
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
|
||||
}
|
||||
columnStatisticBuilder.setUpdatedTime(row.get(13));
|
||||
return columnStatisticBuilder.build();
|
||||
} catch (Exception e) {
|
||||
LOG.warn("Failed to deserialize column statistics, column not exists", e);
|
||||
LOG.warn("Failed to deserialize column statistics.", e);
|
||||
return ColumnStatistic.UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
@ -243,6 +243,53 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h
|
||||
sql """drop stats statistics"""
|
||||
result = sql """show column cached stats statistics"""
|
||||
assertTrue(result.size() == 0)
|
||||
|
||||
sql """use multi_catalog"""
|
||||
sql """analyze table logs1_parquet (log_time) with sync"""
|
||||
def ctlId
|
||||
def dbId
|
||||
def tblId
|
||||
result = sql """show proc '/catalogs'"""
|
||||
|
||||
for (int i = 0; i < result.size(); i++) {
|
||||
if (result[i][1] == catalog_name) {
|
||||
ctlId = result[i][0]
|
||||
}
|
||||
}
|
||||
result = sql """show proc '/catalogs/$ctlId'"""
|
||||
for (int i = 0; i < result.size(); i++) {
|
||||
if (result[i][1] == 'multi_catalog') {
|
||||
dbId = result[i][0]
|
||||
}
|
||||
}
|
||||
result = sql """show proc '/catalogs/$ctlId/$dbId'"""
|
||||
for (int i = 0; i < result.size(); i++) {
|
||||
if (result[i][1] == 'logs1_parquet') {
|
||||
tblId = result[i][0]
|
||||
}
|
||||
}
|
||||
|
||||
result = sql """select * from internal.__internal_schema.column_statistics where id = '${tblId}--1-log_time'"""
|
||||
assertTrue(result.size() == 1)
|
||||
def id = result[0][0]
|
||||
def catalog_id = result[0][1]
|
||||
def db_id = result[0][2]
|
||||
def tbl_id = result[0][3]
|
||||
def idx_id = result[0][4]
|
||||
def col_id = result[0][5]
|
||||
def count = result[0][7]
|
||||
def ndv = result[0][8]
|
||||
def null_count = result[0][9]
|
||||
def data_size_in_bytes = result[0][12]
|
||||
def update_time = result[0][13]
|
||||
|
||||
sql """insert into internal.__internal_schema.column_statistics values ('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count, $ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
|
||||
|
||||
result = sql """show column stats logs1_parquet (log_time)"""
|
||||
assertTrue(result.size() == 1)
|
||||
assertTrue(result[0][6] == "N/A")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
sql """drop catalog ${catalog_name}"""
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user