[Fix](orc-reader) Fix orc dict filter null value issue in _convert_dict_cols_to_string_cols which caused incorrect result. (#21047)
Query results should not contain empty values here: the predicate `commit_id != ""` already filters them out, yet before this fix the query below returned blank rows.

```
use regresssion.multi_catalog;
select commit_id from github_events_orc WHERE (event_type = 'CommitCommentEvent') AND commit_id != "" limit 10;
```

```
+------------------------------------------+
| commit_id                                |
+------------------------------------------+
| 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 |
| 685c1fd8dbbdc10c042932f9a9f88be00ff96c75 |
| 4e3ab2ff2d2474f5d51334b9b0fdf17e9845a166 |
|                                          |
|                                          |
|                                          |
|                                          |
|                                          |
|                                          |
| 7191c20cb49da07a7fc16aa32dc0de4faff528b2 |
+------------------------------------------+
10 rows in set (0.54 sec)
```
```diff
@@ -64,6 +64,8 @@ public:
     using Char = UInt8;
     using Chars = PaddedPODArray<UInt8>;
 
+    static constexpr size_t MAX_STRINGS_OVERFLOW_SIZE = 128;
+
     void static check_chars_length(size_t total_length, size_t element_number) {
         if (UNLIKELY(total_length > MAX_STRING_SIZE)) {
             throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE,
```
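Promoting `MAX_STRINGS_OVERFLOW_SIZE` from a file-local constant in the Parquet dictionary decoder (removed further down) to a public `ColumnString` member lets every producer of string batches size its buffers against the same padding that `insert_many_strings_overflow` relies on. The sketch below only illustrates that contract; `Ref`, `append_overflow`, and `kOverflowPad` are stand-ins, not Doris APIs: the bulk copy reads a fixed-width chunk from every source pointer, even for zero-length strings, so each source needs that many readable bytes behind it.

```cpp
#include <cstddef>
#include <cstring>

// Illustrative sketch only, not the Doris implementation. `Ref` stands in for
// doris::StringRef and `append_overflow` for the overflow-style bulk copy done
// by ColumnString::insert_many_strings_overflow.
struct Ref {
    const char* data;
    size_t size;
};

constexpr size_t kOverflowPad = 128; // plays the role of MAX_STRINGS_OVERFLOW_SIZE

// `dst` must have room for the payload plus trailing padding, and every
// refs[i].data must have at least `chunk` readable bytes: the memcpy below is
// one unconditional wide copy per string, chosen by the caller from the
// longest value and capped by the padding.
void append_overflow(char* dst, const Ref* refs, size_t n, size_t chunk) {
    size_t offset = 0;
    for (size_t i = 0; i < n; ++i) {
        // Even a zero-length placeholder is read `chunk` bytes deep.
        std::memcpy(dst + offset, refs[i].data, chunk);
        offset += refs[i].size; // only the real length advances the write cursor
    }
}
```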
```diff
@@ -94,6 +94,7 @@ namespace doris::vectorized {
 
 // TODO: we need to determine it by test.
 static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits<uint32_t>::max();
+static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = "";
 
 #define FOR_FLAT_ORC_COLUMNS(M) \
     M(TypeIndex::Int8, Int8, orc::LongVectorBatch) \
```
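Under that contract, the placeholder used for filtered-out and null rows cannot point at an arbitrary short buffer such as a `static std::string`'s data: even a zero-length entry is read a full chunk deep. A zero-filled static array of `ColumnString::MAX_STRINGS_OVERFLOW_SIZE` bytes keeps the overflow read inside owned memory. A minimal illustration, where the struct is only a stand-in for `doris::StringRef`:

```cpp
#include <cstddef>

// Illustrative only: StringRefLike stands in for doris::StringRef.
struct StringRefLike {
    const char* data;
    size_t size;
};

constexpr size_t kOverflowPad = 128;

// Zero-filled and statically sized: any overflow read of up to kOverflowPad
// bytes from this pointer stays inside the array.
constexpr char kEmptyForOverflow[kOverflowPad] = "";

// Placeholder for rows that are filtered out or null: length 0, but backed by
// a buffer large enough for the wide copy.
inline StringRefLike empty_placeholder() { return {kEmptyForOverflow, 0}; }
```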
```diff
@@ -1039,7 +1040,6 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                                                      const orc::TypeKind& type_kind,
                                                      orc::EncodedStringVectorBatch* cvb,
                                                      size_t num_values) {
-    const static std::string empty_string;
     std::vector<StringRef> string_values;
     size_t max_value_length = 0;
     string_values.reserve(num_values);
@@ -1054,7 +1054,7 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (cvb->notNull[i]) {
                     if constexpr (is_filter) {
                         if (!filter_data[i]) {
-                            string_values.emplace_back(empty_string.data(), 0);
+                            string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                             continue;
                         }
                     }
@@ -1070,14 +1070,14 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                     // Orc doesn't fill null values in new batch, but the former batch has been release.
                     // Other types like int/long/timestamp... are flat types without pointer in them,
                     // so other types do not need to be handled separately like string.
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
                 if constexpr (is_filter) {
                     if (!filter_data[i]) {
-                        string_values.emplace_back(empty_string.data(), 0);
+                        string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                         continue;
                     }
                 }
@@ -1097,23 +1097,26 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
                 if (cvb->notNull[i]) {
                     if constexpr (is_filter) {
                         if (!filter_data[i]) {
-                            string_values.emplace_back(empty_string.data(), 0);
+                            string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                             continue;
                         }
                     }
                     char* val_ptr;
                     int64_t length;
                     cvb->dictionary->getValueByIndex(cvb->index.data()[i], val_ptr, length);
+                    if (length > max_value_length) {
+                        max_value_length = length;
+                    }
                     string_values.emplace_back(val_ptr, length);
                 } else {
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
             for (int i = 0; i < num_values; ++i) {
                 if constexpr (is_filter) {
                     if (!filter_data[i]) {
-                        string_values.emplace_back(empty_string.data(), 0);
+                        string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                         continue;
                     }
                 }
@@ -1127,7 +1130,8 @@ Status OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
             }
         }
     }
-    data_column->insert_many_strings(&string_values[0], string_values.size());
+    data_column->insert_many_strings_overflow(&string_values[0], string_values.size(),
+                                              max_value_length);
     return Status::OK();
 }
```
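All branches of the decode routine now follow the same pattern: rows rejected by the dict filter or flagged null get the padded placeholder, surviving rows are looked up in the ORC dictionary while `max_value_length` records the longest decoded value, and one `insert_many_strings_overflow` call appends the whole batch. A condensed sketch of that shape, assuming a `getValueByIndex`-style lookup as seen above; the helper and its parameters are illustrative, not the actual `OrcReader` member:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Condensed, illustrative sketch of the decode loop's shape.
struct Ref { const char* data; size_t size; };

template <typename Dict>
std::vector<Ref> decode_dict_batch(const Dict& dict, const int64_t* codes,
                                   const uint8_t* not_null, const uint8_t* filter,
                                   size_t n, size_t& max_value_length,
                                   const char* padded_empty) {
    std::vector<Ref> out;
    out.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        // Filtered-out or null rows never touch the dictionary; they receive
        // the padded empty placeholder so the later overflow copy stays in bounds.
        if ((filter != nullptr && !filter[i]) || (not_null != nullptr && !not_null[i])) {
            out.push_back({padded_empty, 0});
            continue;
        }
        char* val = nullptr;
        int64_t len = 0;
        dict.getValueByIndex(codes[i], val, len); // assumed lookup interface
        if (static_cast<size_t>(len) > max_value_length) {
            max_value_length = len; // needed by insert_many_strings_overflow
        }
        out.push_back({val, static_cast<size_t>(len)});
    }
    return out;
}
```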
```diff
@@ -1938,11 +1942,12 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
             const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr();
             const ColumnInt32* dict_column = assert_cast<const ColumnInt32*>(nested_column.get());
             DCHECK(dict_column);
+            const NullMap& null_map = nullable_column->get_null_map_data();
 
             MutableColumnPtr string_column;
             if (batch_vec != nullptr) {
                 string_column = _convert_dict_column_to_string_column(
-                        dict_column, (*batch_vec)[orc_col_idx->second],
+                        dict_column, &null_map, (*batch_vec)[orc_col_idx->second],
                         _col_orc_type[orc_col_idx->second]);
             } else {
                 string_column = ColumnString::create();
@@ -1958,7 +1963,7 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
             MutableColumnPtr string_column;
             if (batch_vec != nullptr) {
                 string_column = _convert_dict_column_to_string_column(
-                        dict_column, (*batch_vec)[orc_col_idx->second],
+                        dict_column, nullptr, (*batch_vec)[orc_col_idx->second],
                         _col_orc_type[orc_col_idx->second]);
             } else {
                 string_column = ColumnString::create();
```
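The caller now threads the Doris-side null map into the conversion: a nullable dict column passes `&null_map`, a non-nullable one passes `nullptr`, so the decoder can distinguish "this row is NULL in the result" from whatever the ORC batch happens to contain. A minimal sketch of that dispatch with illustrative types, not the Doris column classes:

```cpp
#include <cstdint>
#include <vector>

// Illustrative types only: NullMap is modeled as a byte vector and
// convert_dict_to_string stands in for _convert_dict_column_to_string_column.
using NullMap = std::vector<uint8_t>;

struct DictCodes {
    std::vector<int32_t> codes;
};

void convert_dict_to_string(const DictCodes& codes, const NullMap* null_map) {
    // ... decode codes, consulting null_map only when it is non-null ...
    (void)codes;
    (void)null_map;
}

// Nullable dict column: hand the decoder the result-level null map.
void convert_nullable(const DictCodes& nested, const NullMap& null_map) {
    convert_dict_to_string(nested, &null_map);
}

// Non-nullable dict column: there is no null map to consult.
void convert_plain(const DictCodes& codes) {
    convert_dict_to_string(codes, nullptr);
}
```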
```diff
@@ -1971,12 +1976,15 @@ Status OrcReader::_convert_dict_cols_to_string_cols(
     return Status::OK();
 }
 
+// TODO: Possible optimization points.
+// After filtering the dict column, the null_map for the null dict column should always not be null.
+// Then it can avoid checking null_map. However, currently when inert materialization is enabled,
+// the filter column will not be filtered first, but will be filtered together at the end.
 MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
-        const ColumnInt32* dict_column, orc::ColumnVectorBatch* cvb,
+        const ColumnInt32* dict_column, const NullMap* null_map, orc::ColumnVectorBatch* cvb,
         const orc::Type* orc_column_type) {
     SCOPED_RAW_TIMER(&_statistics.decode_value_time);
     auto res = ColumnString::create();
-    const static std::string empty_string;
     auto* encoded_string_vector_batch = static_cast<orc::EncodedStringVectorBatch*>(cvb);
     DCHECK(encoded_string_vector_batch);
     std::vector<StringRef> string_values;
@@ -1984,11 +1992,12 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
     const int* dict_data = dict_column->get_data().data();
     string_values.reserve(num_values);
     size_t max_value_length = 0;
+    auto* null_map_data = null_map->data();
     if (orc_column_type->getKind() == orc::TypeKind::CHAR) {
         // Possibly there are some zero padding characters in CHAR type, we have to strip them off.
-        if (cvb->hasNulls) {
+        if (null_map) {
             for (int i = 0; i < num_values; ++i) {
                 if (cvb->notNull[i]) {
+                    if (!null_map_data[i]) {
                         char* val_ptr;
                         int64_t length;
                         encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr,
@@ -2002,7 +2011,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
                     // Orc doesn't fill null values in new batch, but the former batch has been release.
                     // Other types like int/long/timestamp... are flat types without pointer in them,
                     // so other types do not need to be handled separately like string.
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
@@ -2019,9 +2028,9 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
             }
         }
     } else {
-        if (cvb->hasNulls) {
+        if (null_map) {
             for (int i = 0; i < num_values; ++i) {
                 if (cvb->notNull[i]) {
+                    if (!null_map_data[i]) {
                         char* val_ptr;
                         int64_t length;
                         encoded_string_vector_batch->dictionary->getValueByIndex(dict_data[i], val_ptr,
@@ -2031,7 +2040,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
                     }
                     string_values.emplace_back(val_ptr, length);
                 } else {
-                    string_values.emplace_back(empty_string.data(), 0);
+                    string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
                 }
             }
         } else {
@@ -2047,7 +2056,7 @@ MutableColumnPtr OrcReader::_convert_dict_column_to_string_column(
             }
         }
     }
-    res->insert_many_strings(&string_values[0], num_values);
+    res->insert_many_strings_overflow(&string_values[0], num_values, max_value_length);
     return res;
 }
```
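The core of the fix is the guard in `_convert_dict_column_to_string_column`: a dictionary code is decoded only when the ORC batch marks the row not-null and the Doris null map (when present) agrees; every other row gets the padded placeholder. The old code keyed the branch off `cvb->hasNulls` and `notNull` alone, which is the null-value issue the commit title refers to. A minimal illustration of the new predicate, using raw per-row byte arrays and illustrative names:

```cpp
#include <cstddef>
#include <cstdint>

// Decode a dictionary code only when the ORC batch flag and the Doris null map
// both say the row carries a value; otherwise the caller emits the padded
// empty placeholder instead of touching the dictionary.
inline bool should_decode(const uint8_t* orc_not_null, const uint8_t* null_map, size_t i) {
    bool orc_has_value = orc_not_null == nullptr || orc_not_null[i];
    bool doris_not_null = null_map == nullptr || null_map[i] == 0;
    return orc_has_value && doris_not_null;
}
```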
```diff
@@ -475,6 +475,7 @@ private:
                                        const std::vector<orc::ColumnVectorBatch*>* batch_vec);
 
     MutableColumnPtr _convert_dict_column_to_string_column(const ColumnInt32* dict_column,
+                                                           const NullMap* null_map,
                                                            orc::ColumnVectorBatch* cvb,
                                                            const orc::Type* orc_column_typ);
```
```diff
@@ -29,8 +29,6 @@
 
 namespace doris::vectorized {
 
-constexpr int MAX_STRINGS_OVERFLOW_SIZE = 128;
-
 Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length,
                                       size_t num_values) {
     _dict = std::move(dict);
@@ -48,7 +46,7 @@ Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t
 
     _dict_value_to_code.reserve(num_values);
     // For insert_many_strings_overflow
-    _dict_data.resize(total_length + MAX_STRINGS_OVERFLOW_SIZE);
+    _dict_data.resize(total_length + ColumnString::MAX_STRINGS_OVERFLOW_SIZE);
     _max_value_length = 0;
     size_t offset = 0;
     offset_cursor = 0;
```
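On the Parquet side, the dictionary decoder drops its private copy of the constant and sizes its dictionary blob with the shared `ColumnString::MAX_STRINGS_OVERFLOW_SIZE`, so the padding it reserves matches what the column's overflow insert will actually read. A hedged sketch of that sizing step; the function and names are illustrative, not `ByteArrayDictDecoder`:

```cpp
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

// Illustrative: build a contiguous dictionary blob with an overflow tail.
// kOverflowPad plays the role of ColumnString::MAX_STRINGS_OVERFLOW_SIZE.
constexpr size_t kOverflowPad = 128;

std::vector<char> build_dict_blob(const std::vector<std::string>& values,
                                  size_t& max_value_length) {
    size_t total_length = 0;
    for (const auto& v : values) total_length += v.size();
    // Payload plus overflow tail: the tail is never referenced by any StringRef,
    // it only keeps the wide copies of insert_many_strings_overflow in bounds.
    std::vector<char> blob(total_length + kOverflowPad, '\0');
    size_t offset = 0;
    max_value_length = 0;
    for (const auto& v : values) {
        std::memcpy(blob.data() + offset, v.data(), v.size());
        offset += v.size();
        if (v.size() > max_value_length) max_value_length = v.size();
    }
    return blob;
}
```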
```diff
@@ -2401,6 +2401,44 @@ centaurean/density 988
 google/boringssl 976
 woboq/woboq_codebrowser 916
 
+-- !60 --
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000001f2ffbd4641882b1b460e0ec21abc42
+000000076395383d824ec8a468a88ee2baa6f121
+000000076395383d824ec8a468a88ee2baa6f121
+0000000bab11354f9a759332065be5f066c3398f
+
+-- !61 --
+fffffe630c2e9b0404d9f3838315e42284d033d6
+fffffa26fa725dbc5e01dd52f18eb63ae5575b8d
+fffff77486c5866ceef27c274849de55ec49e9c0
+fffff600c5b5272a9c40c2f11beb512429b57eed
+fffff427a6fe2f8d73c2ef137570ef93544a8ef5
+fffff23093876194fab64a3fdc182e2c63b49644
+fffff06aa80c33f247249df38a2b3cacecd51653
+ffffe8bc3f37233ebb90f51b68d96cad89d132e6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+
+-- !62 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !63 --
+
 -- !01 --
 Homebrew/homebrew-core 33 15
 pocoproject/poco 28 14
```
```diff
@@ -4803,3 +4841,40 @@ centaurean/density 988
 google/boringssl 976
 woboq/woboq_codebrowser 916
 
+-- !60 --
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+00000000000004a9b86e267361898666ede4eaaa
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000000009c012a4fcfe3b1f1d683163b0768
+0000000001f2ffbd4641882b1b460e0ec21abc42
+000000076395383d824ec8a468a88ee2baa6f121
+000000076395383d824ec8a468a88ee2baa6f121
+0000000bab11354f9a759332065be5f066c3398f
+
+-- !61 --
+fffffe630c2e9b0404d9f3838315e42284d033d6
+fffffa26fa725dbc5e01dd52f18eb63ae5575b8d
+fffff77486c5866ceef27c274849de55ec49e9c0
+fffff600c5b5272a9c40c2f11beb512429b57eed
+fffff427a6fe2f8d73c2ef137570ef93544a8ef5
+fffff23093876194fab64a3fdc182e2c63b49644
+fffff06aa80c33f247249df38a2b3cacecd51653
+ffffe8bc3f37233ebb90f51b68d96cad89d132e6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+ffffe3898bb6b0b4d2c5d162873d7687f57d8ca6
+
+-- !62 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !63 --
```
```diff
@@ -495,6 +495,11 @@ suite("test_external_github", "p2") {
                         ORDER BY stars DESC
                         LIMIT 50"""
 
+    def detail_test1 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id asc limit 10"""
+    def detail_test2 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id != "" order by commit_id desc limit 10"""
+    def detail_test3 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null order by commit_id limit 10"""
+    def detail_test4 = """SELECT /*+SET_VAR(exec_mem_limit=8589934592, query_timeout=600) */commit_id from github_eventsSUFFIX WHERE event_type = 'CommitCommentEvent' AND commit_id is null AND commit_id != "" order by commit_id limit 10"""
+
     String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
     if (enabled != null && enabled.equalsIgnoreCase("true")) {
         String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
@@ -580,6 +585,10 @@ suite("test_external_github", "p2") {
             qt_57 whoAreAllThosePeopleGivingStars1.replace("SUFFIX", format)
             qt_58 whoAreAllThosePeopleGivingStars2.replace("SUFFIX", format)
             qt_59 whoAreAllThosePeopleGivingStars3.replace("SUFFIX", format)
+            qt_60 detail_test1.replace("SUFFIX", format)
+            qt_61 detail_test2.replace("SUFFIX", format)
+            qt_62 detail_test3.replace("SUFFIX", format)
+            qt_63 detail_test4.replace("SUFFIX", format)
         }
     }
 }
```