From 6c48eccc3f00a120f66e8b89e9e0618da9fda576 Mon Sep 17 00:00:00 2001 From: wjhh2008 Date: Mon, 14 Oct 2024 06:14:08 +0000 Subject: [PATCH] [CP] fix read decimal from parquet file crash when precision <= 9 --- .../table/ob_parquet_table_row_iter.cpp | 38 ++++++++++++------- .../engine/table/ob_parquet_table_row_iter.h | 3 ++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/sql/engine/table/ob_parquet_table_row_iter.cpp b/src/sql/engine/table/ob_parquet_table_row_iter.cpp index a0f2d7d2ef..bab295fb9b 100644 --- a/src/sql/engine/table/ob_parquet_table_row_iter.cpp +++ b/src/sql/engine/table/ob_parquet_table_row_iter.cpp @@ -406,7 +406,7 @@ int ObParquetTableRowIterator::next_row_group() { int ret = OB_SUCCESS; //init all meta - if (OB_SUCC(ret) && state_.cur_row_group_idx_ > state_.end_row_group_idx_) { + while (OB_SUCC(ret) && state_.cur_row_group_idx_ > state_.end_row_group_idx_) { if (OB_FAIL(next_file())) { if (OB_ITER_END != ret) { LOG_WARN("fail to next row group", K(ret)); @@ -707,19 +707,27 @@ int ObParquetTableRowIterator::DataLoader::to_numeric_hive( LOG_WARN("overflow", K(length), K(data_len)); } else { //to little endian - MEMSET(buf, (*str >> 8), data_len); - int64_t pos = 0; - int64_t temp_len = length; - while (temp_len >= 8) { - uint64_t temp_v = *(pointer_cast(str + temp_len - 8)); - *(pointer_cast(buf + pos)) = ntohll(temp_v); - pos+=8; - temp_len-=8; - } - if (temp_len > 0) { - MEMCPY(buf + pos + 8 - temp_len, str, temp_len); - uint64_t temp_v = *(pointer_cast(buf + pos)); - *(pointer_cast(buf + pos)) = ntohll(temp_v); + MEMSET(buf, (*str >> 8), data_len); // fill 1 when the input value is negetive, otherwise fill 0 + if (data_len <= 4) { + //for precision <= 9 + MEMCPY(buf + 4 - length, str, length); + uint32_t *res = pointer_cast(buf); + uint32_t temp_v = *res; + *res = ntohl(temp_v); + } else { + int64_t pos = 0; + int64_t temp_len = length; + while (temp_len >= 8) { + uint64_t temp_v = *(pointer_cast(str + temp_len - 8)); + *(pointer_cast(buf + pos)) = ntohll(temp_v); + pos+=8; + temp_len-=8; + } + if (temp_len > 0) { + MEMCPY(buf + pos + 8 - temp_len, str, temp_len); + uint64_t temp_v = *(pointer_cast(buf + pos)); + *(pointer_cast(buf + pos)) = ntohll(temp_v); + } } decint = pointer_cast(buf); val_len = static_cast(data_len); @@ -1402,6 +1410,8 @@ int ObParquetTableRowIterator::get_next_rows(int64_t &count, int64_t capacity) if (OB_SUCC(ret)) { state_.cur_row_group_read_row_count_ += read_count; count = read_count; + } else { + LOG_WARN("fail to get next rows from parquet file", K(ret), K(state_)); } return ret; } diff --git a/src/sql/engine/table/ob_parquet_table_row_iter.h b/src/sql/engine/table/ob_parquet_table_row_iter.h index e37d6f551b..6f66862206 100644 --- a/src/sql/engine/table/ob_parquet_table_row_iter.h +++ b/src/sql/engine/table/ob_parquet_table_row_iter.h @@ -126,6 +126,9 @@ public: int64_t cur_line_number_; ObString cur_file_url_; ObNewRow part_list_val_; + TO_STRING_KV(K(file_idx_), K(part_id_), K(row_group_idx_), K(cur_file_id_), K(cur_row_group_idx_), + K(end_row_group_idx_), K(cur_row_group_read_row_count_), K(cur_row_group_row_count_), + K(cur_line_number_), K(cur_file_url_)); }; public: ObParquetTableRowIterator() :