[fix](statistics) Disable fetching min/max column stats through HMS, because the values may be inaccurate and misleading. (#35124) (#35145)

backport #35124
This commit is contained in:
Jibing-Li
2024-05-21 22:58:12 +08:00
committed by GitHub
parent 009ab77c25
commit 37f1bf317c
2 changed files with 38 additions and 79 deletions

View File

@ -69,7 +69,6 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
@ -82,9 +81,6 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@ -697,8 +693,11 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
return Optional.empty();
}
Map<String, String> parameters = remoteTable.getParameters();
if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
return Optional.empty();
}
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
double count = parameters.containsKey(NUM_ROWS) ? Double.parseDouble(parameters.get(NUM_ROWS)) : 0;
long count = Long.parseLong(parameters.get(NUM_ROWS));
columnStatisticBuilder.setCount(count);
// The tableStats length is at most 1.
for (ColumnStatisticsObj tableStat : tableStats) {
@ -719,12 +718,10 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
return Optional.of(columnStatisticBuilder.build());
}
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, double count)
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, long count)
throws AnalysisException {
long ndv = 0;
long nulls = 0;
String min = "";
String max = "";
double colSize = 0;
if (!data.isSetStringStats()) {
colSize = count * col.getType().getSlotSize();
@ -734,8 +731,6 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
LongColumnStatsData longStats = data.getLongStats();
ndv = longStats.getNumDVs();
nulls = longStats.getNumNulls();
min = String.valueOf(longStats.getLowValue());
max = String.valueOf(longStats.getHighValue());
} else if (data.isSetStringStats()) {
StringColumnStatsData stringStats = data.getStringStats();
ndv = stringStats.getNumDVs();
@ -746,67 +741,23 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
DecimalColumnStatsData decimalStats = data.getDecimalStats();
ndv = decimalStats.getNumDVs();
nulls = decimalStats.getNumNulls();
if (decimalStats.isSetLowValue()) {
Decimal lowValue = decimalStats.getLowValue();
if (lowValue != null) {
BigDecimal lowDecimal = new BigDecimal(new BigInteger(lowValue.getUnscaled()), lowValue.getScale());
min = lowDecimal.toString();
}
}
if (decimalStats.isSetHighValue()) {
Decimal highValue = decimalStats.getHighValue();
if (highValue != null) {
BigDecimal highDecimal =
new BigDecimal(new BigInteger(highValue.getUnscaled()), highValue.getScale());
max = highDecimal.toString();
}
}
} else if (data.isSetDoubleStats()) {
DoubleColumnStatsData doubleStats = data.getDoubleStats();
ndv = doubleStats.getNumDVs();
nulls = doubleStats.getNumNulls();
min = String.valueOf(doubleStats.getLowValue());
max = String.valueOf(doubleStats.getHighValue());
} else if (data.isSetDateStats()) {
DateColumnStatsData dateStats = data.getDateStats();
ndv = dateStats.getNumDVs();
nulls = dateStats.getNumNulls();
if (dateStats.isSetLowValue()) {
org.apache.hadoop.hive.metastore.api.Date lowValue = dateStats.getLowValue();
if (lowValue != null) {
LocalDate lowDate = LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch());
min = lowDate.toString();
}
}
if (dateStats.isSetHighValue()) {
org.apache.hadoop.hive.metastore.api.Date highValue = dateStats.getHighValue();
if (highValue != null) {
LocalDate highDate = LocalDate.ofEpochDay(highValue.getDaysSinceEpoch());
max = highDate.toString();
}
}
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Not suitable data type for column %s", col.getName()));
}
throw new RuntimeException("Not supported data type.");
LOG.warn(String.format("Not suitable data type for column %s", col.getName()));
}
builder.setNdv(ndv);
builder.setNumNulls(nulls);
builder.setDataSize(colSize);
builder.setAvgSizeByte(colSize / count);
if (!min.equals("")) {
builder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
builder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
} else {
builder.setMinValue(Double.MIN_VALUE);
}
if (!max.equals("")) {
builder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
builder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
} else {
builder.setMaxValue(Double.MAX_VALUE);
}
builder.setMinValue(Double.NEGATIVE_INFINITY);
builder.setMaxValue(Double.POSITIVE_INFINITY);
}
public void setEventUpdateTime(long updateTime) {

View File

@ -55,8 +55,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "\'1992-01-04\'")
assertTrue(result[0][8] == "\'1998-12-31\'")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_tax)"""
assertTrue(result.size() == 1)
@ -66,8 +66,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
assertTrue(result[0][7] == "0")
assertTrue(result[0][8] == "0.08")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipmode)"""
assertTrue(result.size() == 1)
@ -77,6 +77,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.5717007E7")
assertTrue(result[0][6] == "4.285300060071169")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_suppkey)"""
assertTrue(result.size() == 1)
@ -86,8 +88,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "1")
assertTrue(result[0][8] == "7")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipdate)"""
assertTrue(result.size() == 1)
@ -97,8 +99,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "\'1992-01-02\'")
assertTrue(result[0][8] == "\'1998-12-01\'")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_commitdate)"""
assertTrue(result.size() == 1)
@ -108,8 +110,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "\'1992-01-31\'")
assertTrue(result[0][8] == "\'1998-10-31\'")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_partkey)"""
assertTrue(result.size() == 1)
@ -119,8 +121,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "1")
assertTrue(result[0][8] == "10000")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_orderkey)"""
assertTrue(result.size() == 1)
@ -130,8 +132,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "1")
assertTrue(result[0][8] == "6000000")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_quantity)"""
assertTrue(result.size() == 1)
@ -141,8 +143,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
assertTrue(result[0][7] == "1")
assertTrue(result[0][8] == "50")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_linestatus)"""
assertTrue(result.size() == 1)
@ -152,6 +154,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "6001215.0")
assertTrue(result[0][6] == "1.0")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_comment)"""
assertTrue(result.size() == 1)
@ -161,6 +165,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "1.5899739E8")
assertTrue(result[0][6] == "26.494199924515286")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_extendedprice)"""
assertTrue(result.size() == 1)
@ -170,8 +176,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
assertTrue(result[0][7] == "901")
assertTrue(result[0][8] == "104949.5")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_linenumber)"""
assertTrue(result.size() == 1)
@ -181,8 +187,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
assertTrue(result[0][7] == "1")
assertTrue(result[0][8] == "200000")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_discount)"""
assertTrue(result.size() == 1)
@ -192,8 +198,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
assertTrue(result[0][7] == "0")
assertTrue(result[0][8] == "0.1")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipinstruct)"""
assertTrue(result.size() == 1)
@ -203,6 +209,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "7.2006178E7")
assertTrue(result[0][6] == "11.998599950176756")
assertTrue(result[0][7] == "N/A")
assertTrue(result[0][8] == "N/A")
for (int i = 0; i < 10; i++) {
result = sql """show table stats lineitem"""