[fix](statistics) Disable fetching min/max column stats through HMS, because the values may be inaccurate and misleading. (#35124) (#35145)
backport #35124
This commit is contained in:
@ -69,7 +69,6 @@ import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
|
||||
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
|
||||
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
|
||||
import org.apache.hadoop.hive.metastore.api.Decimal;
|
||||
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
|
||||
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
@ -82,9 +81,6 @@ import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -697,8 +693,11 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
|
||||
return Optional.empty();
|
||||
}
|
||||
Map<String, String> parameters = remoteTable.getParameters();
|
||||
if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
|
||||
return Optional.empty();
|
||||
}
|
||||
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
|
||||
double count = parameters.containsKey(NUM_ROWS) ? Double.parseDouble(parameters.get(NUM_ROWS)) : 0;
|
||||
long count = Long.parseLong(parameters.get(NUM_ROWS));
|
||||
columnStatisticBuilder.setCount(count);
|
||||
// The tableStats length is at most 1.
|
||||
for (ColumnStatisticsObj tableStat : tableStats) {
|
||||
@ -719,12 +718,10 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
|
||||
return Optional.of(columnStatisticBuilder.build());
|
||||
}
|
||||
|
||||
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, double count)
|
||||
private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, long count)
|
||||
throws AnalysisException {
|
||||
long ndv = 0;
|
||||
long nulls = 0;
|
||||
String min = "";
|
||||
String max = "";
|
||||
double colSize = 0;
|
||||
if (!data.isSetStringStats()) {
|
||||
colSize = count * col.getType().getSlotSize();
|
||||
@ -734,8 +731,6 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
|
||||
LongColumnStatsData longStats = data.getLongStats();
|
||||
ndv = longStats.getNumDVs();
|
||||
nulls = longStats.getNumNulls();
|
||||
min = String.valueOf(longStats.getLowValue());
|
||||
max = String.valueOf(longStats.getHighValue());
|
||||
} else if (data.isSetStringStats()) {
|
||||
StringColumnStatsData stringStats = data.getStringStats();
|
||||
ndv = stringStats.getNumDVs();
|
||||
@ -746,67 +741,23 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI
|
||||
DecimalColumnStatsData decimalStats = data.getDecimalStats();
|
||||
ndv = decimalStats.getNumDVs();
|
||||
nulls = decimalStats.getNumNulls();
|
||||
if (decimalStats.isSetLowValue()) {
|
||||
Decimal lowValue = decimalStats.getLowValue();
|
||||
if (lowValue != null) {
|
||||
BigDecimal lowDecimal = new BigDecimal(new BigInteger(lowValue.getUnscaled()), lowValue.getScale());
|
||||
min = lowDecimal.toString();
|
||||
}
|
||||
}
|
||||
if (decimalStats.isSetHighValue()) {
|
||||
Decimal highValue = decimalStats.getHighValue();
|
||||
if (highValue != null) {
|
||||
BigDecimal highDecimal =
|
||||
new BigDecimal(new BigInteger(highValue.getUnscaled()), highValue.getScale());
|
||||
max = highDecimal.toString();
|
||||
}
|
||||
}
|
||||
} else if (data.isSetDoubleStats()) {
|
||||
DoubleColumnStatsData doubleStats = data.getDoubleStats();
|
||||
ndv = doubleStats.getNumDVs();
|
||||
nulls = doubleStats.getNumNulls();
|
||||
min = String.valueOf(doubleStats.getLowValue());
|
||||
max = String.valueOf(doubleStats.getHighValue());
|
||||
} else if (data.isSetDateStats()) {
|
||||
DateColumnStatsData dateStats = data.getDateStats();
|
||||
ndv = dateStats.getNumDVs();
|
||||
nulls = dateStats.getNumNulls();
|
||||
if (dateStats.isSetLowValue()) {
|
||||
org.apache.hadoop.hive.metastore.api.Date lowValue = dateStats.getLowValue();
|
||||
if (lowValue != null) {
|
||||
LocalDate lowDate = LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch());
|
||||
min = lowDate.toString();
|
||||
}
|
||||
}
|
||||
if (dateStats.isSetHighValue()) {
|
||||
org.apache.hadoop.hive.metastore.api.Date highValue = dateStats.getHighValue();
|
||||
if (highValue != null) {
|
||||
LocalDate highDate = LocalDate.ofEpochDay(highValue.getDaysSinceEpoch());
|
||||
max = highDate.toString();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("Not suitable data type for column %s", col.getName()));
|
||||
}
|
||||
throw new RuntimeException("Not supported data type.");
|
||||
LOG.warn(String.format("Not suitable data type for column %s", col.getName()));
|
||||
}
|
||||
builder.setNdv(ndv);
|
||||
builder.setNumNulls(nulls);
|
||||
builder.setDataSize(colSize);
|
||||
builder.setAvgSizeByte(colSize / count);
|
||||
if (!min.equals("")) {
|
||||
builder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
|
||||
builder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
|
||||
} else {
|
||||
builder.setMinValue(Double.MIN_VALUE);
|
||||
}
|
||||
if (!max.equals("")) {
|
||||
builder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
|
||||
builder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
|
||||
} else {
|
||||
builder.setMaxValue(Double.MAX_VALUE);
|
||||
}
|
||||
builder.setMinValue(Double.NEGATIVE_INFINITY);
|
||||
builder.setMaxValue(Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
public void setEventUpdateTime(long updateTime) {
|
||||
|
||||
@ -55,8 +55,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "\'1992-01-04\'")
|
||||
assertTrue(result[0][8] == "\'1998-12-31\'")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_tax)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -66,8 +66,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "4.800972E7")
|
||||
assertTrue(result[0][6] == "8.0")
|
||||
assertTrue(result[0][7] == "0")
|
||||
assertTrue(result[0][8] == "0.08")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_shipmode)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -77,6 +77,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.5717007E7")
|
||||
assertTrue(result[0][6] == "4.285300060071169")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_suppkey)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -86,8 +88,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "1")
|
||||
assertTrue(result[0][8] == "7")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_shipdate)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -97,8 +99,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "\'1992-01-02\'")
|
||||
assertTrue(result[0][8] == "\'1998-12-01\'")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_commitdate)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -108,8 +110,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "\'1992-01-31\'")
|
||||
assertTrue(result[0][8] == "\'1998-10-31\'")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_partkey)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -119,8 +121,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "1")
|
||||
assertTrue(result[0][8] == "10000")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_orderkey)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -130,8 +132,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "1")
|
||||
assertTrue(result[0][8] == "6000000")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_quantity)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -141,8 +143,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "4.800972E7")
|
||||
assertTrue(result[0][6] == "8.0")
|
||||
assertTrue(result[0][7] == "1")
|
||||
assertTrue(result[0][8] == "50")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_linestatus)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -152,6 +154,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "6001215.0")
|
||||
assertTrue(result[0][6] == "1.0")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_comment)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -161,6 +165,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "1.5899739E8")
|
||||
assertTrue(result[0][6] == "26.494199924515286")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_extendedprice)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -170,8 +176,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "4.800972E7")
|
||||
assertTrue(result[0][6] == "8.0")
|
||||
assertTrue(result[0][7] == "901")
|
||||
assertTrue(result[0][8] == "104949.5")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_linenumber)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -181,8 +187,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "2.400486E7")
|
||||
assertTrue(result[0][6] == "4.0")
|
||||
assertTrue(result[0][7] == "1")
|
||||
assertTrue(result[0][8] == "200000")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_discount)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -192,8 +198,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "4.800972E7")
|
||||
assertTrue(result[0][6] == "8.0")
|
||||
assertTrue(result[0][7] == "0")
|
||||
assertTrue(result[0][8] == "0.1")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
result = sql """show column cached stats lineitem (l_shipinstruct)"""
|
||||
assertTrue(result.size() == 1)
|
||||
@ -203,6 +209,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa
|
||||
assertTrue(result[0][4] == "0.0")
|
||||
assertTrue(result[0][5] == "7.2006178E7")
|
||||
assertTrue(result[0][6] == "11.998599950176756")
|
||||
assertTrue(result[0][7] == "N/A")
|
||||
assertTrue(result[0][8] == "N/A")
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
result = sql """show table stats lineitem"""
|
||||
|
||||
Reference in New Issue
Block a user