From e42465ae5949ff020aa70fab7143cbae0e2d4609 Mon Sep 17 00:00:00 2001 From: Ashin Gau Date: Sun, 26 Feb 2023 08:10:40 +0800 Subject: [PATCH] [fix](OrcReader) handle null values in orc reader for string type (#17135) Orc doesn't fill null values in new batch, but the former batch has been released. Other types like int/long/timestamp... are flat types without pointer in them, so other types do not need to be handled separately like string. Each row's notNull flag is checked individually, and null rows are emitted as empty strings instead of reading data[i]/length[i]. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index c124aa1b8a..a31f7c152c 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -664,6 +664,7 @@ Status OrcReader::_decode_string_column(const std::string& col_name, const orc::TypeKind& type_kind, orc::ColumnVectorBatch* cvb, size_t num_values) { SCOPED_RAW_TIMER(&_statistics.decode_value_time); + const static std::string empty_string; auto* data = down_cast(cvb); if (data == nullptr) { return Status::InternalError("Wrong data type for colum '{}'", col_name); @@ -673,11 +674,23 @@ Status OrcReader::_decode_string_column(const std::string& col_name, if (type_kind == orc::TypeKind::CHAR) { // Possibly there are some zero padding characters in CHAR type, we have to strip them off. for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(data->data[i], trim_right(data->data[i], data->length[i])); + if (cvb->notNull[i]) { + string_values.emplace_back(data->data[i], + trim_right(data->data[i], data->length[i])); + } else { + // Orc doesn't fill null values in new batch, but the former batch has been released. + // Other types like int/long/timestamp... are flat types without pointer in them, + // so other types do not need to be handled separately like string. 
+ string_values.emplace_back(empty_string.data(), 0); + } } } else { for (int i = 0; i < num_values; ++i) { - string_values.emplace_back(data->data[i], data->length[i]); + if (cvb->notNull[i]) { + string_values.emplace_back(data->data[i], data->length[i]); + } else { + string_values.emplace_back(empty_string.data(), 0); + } } } data_column->insert_many_strings(&string_values[0], num_values);