From 37f1bf317cebc1cef749e95b07a87a8efd92fcf6 Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Tue, 21 May 2024 22:58:12 +0800 Subject: [PATCH] [fix](statistics)Disable fetching min/max column stats through HMS, because the values may be inaccurate and misleading. (#35124) (#35145) backport #35124 --- .../datasource/hive/HMSExternalTable.java | 65 +++---------------- .../hive/test_hive_statistics_from_hms.groovy | 52 ++++++++------- 2 files changed, 38 insertions(+), 79 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index e5624fb58b..4d3f963aa5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -69,7 +69,6 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; -import org.apache.hadoop.hive.metastore.api.Decimal; import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.FieldSchema; @@ -82,9 +81,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.time.LocalDate; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -697,8 +693,11 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI return Optional.empty(); } Map parameters = remoteTable.getParameters(); + if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) { + return 
Optional.empty(); + } ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); - double count = parameters.containsKey(NUM_ROWS) ? Double.parseDouble(parameters.get(NUM_ROWS)) : 0; + long count = Long.parseLong(parameters.get(NUM_ROWS)); columnStatisticBuilder.setCount(count); // The tableStats length is at most 1. for (ColumnStatisticsObj tableStat : tableStats) { @@ -719,12 +718,10 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI return Optional.of(columnStatisticBuilder.build()); } - private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, double count) + private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticBuilder builder, long count) throws AnalysisException { long ndv = 0; long nulls = 0; - String min = ""; - String max = ""; double colSize = 0; if (!data.isSetStringStats()) { colSize = count * col.getType().getSlotSize(); @@ -734,8 +731,6 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI LongColumnStatsData longStats = data.getLongStats(); ndv = longStats.getNumDVs(); nulls = longStats.getNumNulls(); - min = String.valueOf(longStats.getLowValue()); - max = String.valueOf(longStats.getHighValue()); } else if (data.isSetStringStats()) { StringColumnStatsData stringStats = data.getStringStats(); ndv = stringStats.getNumDVs(); @@ -746,67 +741,23 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI DecimalColumnStatsData decimalStats = data.getDecimalStats(); ndv = decimalStats.getNumDVs(); nulls = decimalStats.getNumNulls(); - if (decimalStats.isSetLowValue()) { - Decimal lowValue = decimalStats.getLowValue(); - if (lowValue != null) { - BigDecimal lowDecimal = new BigDecimal(new BigInteger(lowValue.getUnscaled()), lowValue.getScale()); - min = lowDecimal.toString(); - } - } - if (decimalStats.isSetHighValue()) { - Decimal highValue = decimalStats.getHighValue(); - if (highValue 
!= null) { - BigDecimal highDecimal = - new BigDecimal(new BigInteger(highValue.getUnscaled()), highValue.getScale()); - max = highDecimal.toString(); - } - } } else if (data.isSetDoubleStats()) { DoubleColumnStatsData doubleStats = data.getDoubleStats(); ndv = doubleStats.getNumDVs(); nulls = doubleStats.getNumNulls(); - min = String.valueOf(doubleStats.getLowValue()); - max = String.valueOf(doubleStats.getHighValue()); } else if (data.isSetDateStats()) { DateColumnStatsData dateStats = data.getDateStats(); ndv = dateStats.getNumDVs(); nulls = dateStats.getNumNulls(); - if (dateStats.isSetLowValue()) { - org.apache.hadoop.hive.metastore.api.Date lowValue = dateStats.getLowValue(); - if (lowValue != null) { - LocalDate lowDate = LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch()); - min = lowDate.toString(); - } - } - if (dateStats.isSetHighValue()) { - org.apache.hadoop.hive.metastore.api.Date highValue = dateStats.getHighValue(); - if (highValue != null) { - LocalDate highDate = LocalDate.ofEpochDay(highValue.getDaysSinceEpoch()); - max = highDate.toString(); - } - } } else { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("Not suitable data type for column %s", col.getName())); - } - throw new RuntimeException("Not supported data type."); + LOG.warn(String.format("Not suitable data type for column %s", col.getName())); } builder.setNdv(ndv); builder.setNumNulls(nulls); builder.setDataSize(colSize); builder.setAvgSizeByte(colSize / count); - if (!min.equals("")) { - builder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min)); - builder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min)); - } else { - builder.setMinValue(Double.MIN_VALUE); - } - if (!max.equals("")) { - builder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max)); - builder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max)); - } else { - builder.setMaxValue(Double.MAX_VALUE); - } + builder.setMinValue(Double.NEGATIVE_INFINITY); + 
builder.setMaxValue(Double.POSITIVE_INFINITY); } public void setEventUpdateTime(long updateTime) { diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy index c3c671bb03..3a067fa42f 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy @@ -55,8 +55,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - assertTrue(result[0][7] == "\'1992-01-04\'") - assertTrue(result[0][8] == "\'1998-12-31\'") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_tax)""" assertTrue(result.size() == 1) @@ -66,8 +66,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "4.800972E7") assertTrue(result[0][6] == "8.0") - assertTrue(result[0][7] == "0") - assertTrue(result[0][8] == "0.08") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_shipmode)""" assertTrue(result.size() == 1) @@ -77,6 +77,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.5717007E7") assertTrue(result[0][6] == "4.285300060071169") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_suppkey)""" assertTrue(result.size() == 1) @@ -86,8 +88,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == 
"4.0") - assertTrue(result[0][7] == "1") - assertTrue(result[0][8] == "7") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_shipdate)""" assertTrue(result.size() == 1) @@ -97,8 +99,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - assertTrue(result[0][7] == "\'1992-01-02\'") - assertTrue(result[0][8] == "\'1998-12-01\'") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_commitdate)""" assertTrue(result.size() == 1) @@ -108,8 +110,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - assertTrue(result[0][7] == "\'1992-01-31\'") - assertTrue(result[0][8] == "\'1998-10-31\'") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_partkey)""" assertTrue(result.size() == 1) @@ -119,8 +121,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - assertTrue(result[0][7] == "1") - assertTrue(result[0][8] == "10000") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_orderkey)""" assertTrue(result.size() == 1) @@ -130,8 +132,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - assertTrue(result[0][7] == "1") - assertTrue(result[0][8] == "6000000") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") 
result = sql """show column cached stats lineitem (l_quantity)""" assertTrue(result.size() == 1) @@ -141,8 +143,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "4.800972E7") assertTrue(result[0][6] == "8.0") - assertTrue(result[0][7] == "1") - assertTrue(result[0][8] == "50") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_linestatus)""" assertTrue(result.size() == 1) @@ -152,6 +154,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "6001215.0") assertTrue(result[0][6] == "1.0") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_comment)""" assertTrue(result.size() == 1) @@ -161,6 +165,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "1.5899739E8") assertTrue(result[0][6] == "26.494199924515286") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_extendedprice)""" assertTrue(result.size() == 1) @@ -170,8 +176,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "4.800972E7") assertTrue(result[0][6] == "8.0") - assertTrue(result[0][7] == "901") - assertTrue(result[0][8] == "104949.5") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_linenumber)""" assertTrue(result.size() == 1) @@ -181,8 +187,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "2.400486E7") assertTrue(result[0][6] == "4.0") - 
assertTrue(result[0][7] == "1") - assertTrue(result[0][8] == "200000") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_discount)""" assertTrue(result.size() == 1) @@ -192,8 +198,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "4.800972E7") assertTrue(result[0][6] == "8.0") - assertTrue(result[0][7] == "0") - assertTrue(result[0][8] == "0.1") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") result = sql """show column cached stats lineitem (l_shipinstruct)""" assertTrue(result.size() == 1) @@ -203,6 +209,8 @@ suite("test_hive_statistics_from_hms", "p2,external,hive,external_remote,externa assertTrue(result[0][4] == "0.0") assertTrue(result[0][5] == "7.2006178E7") assertTrue(result[0][6] == "11.998599950176756") + assertTrue(result[0][7] == "N/A") + assertTrue(result[0][8] == "N/A") for (int i = 0; i < 10; i++) { result = sql """show table stats lineitem"""