[Optimize](parquet-reader) Filter using null-count statistics at the row group and page level. (#19106)

Issue Number: Related to #19038. In that case, l_orderkey contains many nulls,
so row groups and pages can be filtered out using the null-count statistics
at the row group and page level, which improves performance considerably.
This commit is contained in:
Qi Chen
2023-04-27 21:21:30 +08:00
committed by GitHub
parent 95d91e7010
commit e4f7d77c5c
3 changed files with 25 additions and 10 deletions

View File

@ -369,13 +369,22 @@ private:
}
public:
static bool filter_by_min_max(const ColumnValueRangeType& col_val_range,
const FieldSchema* col_schema, const std::string& encoded_min,
const std::string& encoded_max, const cctz::time_zone& ctz) {
static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
const FieldSchema* col_schema, bool is_set_min_max,
const std::string& encoded_min, const std::string& encoded_max,
bool is_all_null, const cctz::time_zone& ctz) {
bool need_filter = false;
std::visit(
[&](auto&& range) {
std::vector<ScanPredicate> filters = _value_range_to_predicate(range);
// Currently, ScanPredicate does not cover "is null" or "x = null"; filters is empty when the range contains such expressions.
// So when filters is non-empty, it is safe to filter on is_all_null.
if (!filters.empty()) {
need_filter = is_all_null;
if (need_filter) {
return;
}
}
for (auto& filter : filters) {
need_filter |= _filter_by_min_max(range, filter, col_schema, encoded_min,
encoded_max, ctz);

View File

@ -67,9 +67,10 @@ Status PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index
const int num_of_pages = column_index->null_pages.size();
for (int page_id = 0; page_id < num_of_pages; page_id++) {
if (ParquetPredicate::filter_by_min_max(col_val_range, col_schema,
encoded_min_vals[page_id],
encoded_max_vals[page_id], ctz)) {
bool is_all_null = column_index->null_pages[page_id];
if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, !is_all_null,
encoded_min_vals[page_id], encoded_max_vals[page_id],
is_all_null, ctz)) {
skipped_ranges.emplace_back(page_id);
}
}

View File

@ -801,14 +801,19 @@ Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::Co
continue;
}
int parquet_col_id = col_iter->second;
auto& statistic = columns[parquet_col_id].meta_data.statistics;
if (!statistic.__isset.max || !statistic.__isset.min) {
auto& meta_data = columns[parquet_col_id].meta_data;
auto& statistic = meta_data.statistics;
bool is_all_null =
(statistic.__isset.null_count && statistic.null_count == meta_data.num_values);
bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min);
if ((!is_set_min_max) && (!is_all_null)) {
continue;
}
const FieldSchema* col_schema = schema_desc.get_column(col_name);
// Min-max of statistic is plain-encoded value
*filter_group = ParquetPredicate::filter_by_min_max(slot_iter->second, col_schema,
statistic.min, statistic.max, *_ctz);
*filter_group =
ParquetPredicate::filter_by_stats(slot_iter->second, col_schema, is_set_min_max,
statistic.min, statistic.max, is_all_null, *_ctz);
if (*filter_group) {
break;
}