diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h index 60001f649e..8d3057312b 100644 --- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h +++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h @@ -369,13 +369,22 @@ private: } public: - static bool filter_by_min_max(const ColumnValueRangeType& col_val_range, - const FieldSchema* col_schema, const std::string& encoded_min, - const std::string& encoded_max, const cctz::time_zone& ctz) { + static bool filter_by_stats(const ColumnValueRangeType& col_val_range, + const FieldSchema* col_schema, bool is_set_min_max, + const std::string& encoded_min, const std::string& encoded_max, + bool is_all_null, const cctz::time_zone& ctz) { bool need_filter = false; std::visit( [&](auto&& range) { std::vector filters = _value_range_to_predicate(range); + // Currently, ScanPredicate doesn't include "is null" && "x = null", filters will be empty when contains these exprs. + // So we can handle is_all_null safely. + if (!filters.empty()) { + need_filter = is_all_null; + if (need_filter) { + return; + } + } for (auto& filter : filters) { need_filter |= _filter_by_min_max(range, filter, col_schema, encoded_min, encoded_max, ctz); diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp index bad05728eb..35cf076318 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp @@ -67,9 +67,10 @@ Status PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index const int num_of_pages = column_index->null_pages.size(); for (int page_id = 0; page_id < num_of_pages; page_id++) { - if (ParquetPredicate::filter_by_min_max(col_val_range, col_schema, - encoded_min_vals[page_id], - encoded_max_vals[page_id], ctz)) { + bool is_all_null = column_index->null_pages[page_id]; + if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, !is_all_null, + encoded_min_vals[page_id], encoded_max_vals[page_id], + is_all_null, ctz)) { skipped_ranges.emplace_back(page_id); } } diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index efe81a953f..388dbcf838 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -801,14 +801,19 @@ Status ParquetReader::_process_column_stat_filter(const std::vectorsecond; - auto& statistic = columns[parquet_col_id].meta_data.statistics; - if (!statistic.__isset.max || !statistic.__isset.min) { + auto& meta_data = columns[parquet_col_id].meta_data; + auto& statistic = meta_data.statistics; + bool is_all_null = + (statistic.__isset.null_count && statistic.null_count == meta_data.num_values); + bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min); + if ((!is_set_min_max) && (!is_all_null)) { continue; } const FieldSchema* col_schema = schema_desc.get_column(col_name); // Min-max of statistic is plain-encoded value - *filter_group = ParquetPredicate::filter_by_min_max(slot_iter->second, col_schema, - statistic.min, statistic.max, *_ctz); + *filter_group = + ParquetPredicate::filter_by_stats(slot_iter->second, col_schema, is_set_min_max, + statistic.min, statistic.max, is_all_null, *_ctz); if (*filter_group) { break; }