[Optimize](parquet-reader) Optimize by filtering with null-count statistics at the row-group and page level. (#19106)
Issue Number: relates to #19038. In that case, l_orderkey contains many nulls, so we can filter it using the null-count statistics at the row-group and page level, which significantly improves performance for this query.
This commit is contained in:
@ -369,13 +369,22 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
static bool filter_by_min_max(const ColumnValueRangeType& col_val_range,
|
||||
const FieldSchema* col_schema, const std::string& encoded_min,
|
||||
const std::string& encoded_max, const cctz::time_zone& ctz) {
|
||||
static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
|
||||
const FieldSchema* col_schema, bool is_set_min_max,
|
||||
const std::string& encoded_min, const std::string& encoded_max,
|
||||
bool is_all_null, const cctz::time_zone& ctz) {
|
||||
bool need_filter = false;
|
||||
std::visit(
|
||||
[&](auto&& range) {
|
||||
std::vector<ScanPredicate> filters = _value_range_to_predicate(range);
|
||||
// Currently, ScanPredicate doesn't include "is null" or "x = null"; filters will be empty when the range contains such exprs.
|
||||
// So we can handle is_all_null safely.
|
||||
if (!filters.empty()) {
|
||||
need_filter = is_all_null;
|
||||
if (need_filter) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (auto& filter : filters) {
|
||||
need_filter |= _filter_by_min_max(range, filter, col_schema, encoded_min,
|
||||
encoded_max, ctz);
|
||||
|
||||
@ -67,9 +67,10 @@ Status PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index
|
||||
|
||||
const int num_of_pages = column_index->null_pages.size();
|
||||
for (int page_id = 0; page_id < num_of_pages; page_id++) {
|
||||
if (ParquetPredicate::filter_by_min_max(col_val_range, col_schema,
|
||||
encoded_min_vals[page_id],
|
||||
encoded_max_vals[page_id], ctz)) {
|
||||
bool is_all_null = column_index->null_pages[page_id];
|
||||
if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, !is_all_null,
|
||||
encoded_min_vals[page_id], encoded_max_vals[page_id],
|
||||
is_all_null, ctz)) {
|
||||
skipped_ranges.emplace_back(page_id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -801,14 +801,19 @@ Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::Co
|
||||
continue;
|
||||
}
|
||||
int parquet_col_id = col_iter->second;
|
||||
auto& statistic = columns[parquet_col_id].meta_data.statistics;
|
||||
if (!statistic.__isset.max || !statistic.__isset.min) {
|
||||
auto& meta_data = columns[parquet_col_id].meta_data;
|
||||
auto& statistic = meta_data.statistics;
|
||||
bool is_all_null =
|
||||
(statistic.__isset.null_count && statistic.null_count == meta_data.num_values);
|
||||
bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min);
|
||||
if ((!is_set_min_max) && (!is_all_null)) {
|
||||
continue;
|
||||
}
|
||||
const FieldSchema* col_schema = schema_desc.get_column(col_name);
|
||||
// Min-max of statistic is plain-encoded value
|
||||
*filter_group = ParquetPredicate::filter_by_min_max(slot_iter->second, col_schema,
|
||||
statistic.min, statistic.max, *_ctz);
|
||||
*filter_group =
|
||||
ParquetPredicate::filter_by_stats(slot_iter->second, col_schema, is_set_min_max,
|
||||
statistic.min, statistic.max, is_all_null, *_ctz);
|
||||
if (*filter_group) {
|
||||
break;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user