branch-2.1: [fix](be) fix parquet file reader not updating page index when processing it #52228 (#52782)

Cherry-picked from #52228

Co-authored-by: SWEI <z_swii@outlook.com>
Co-authored-by: zengsiwei <zengsiwei@kuaishou.com>
Co-authored-by: suxiaogang223 <suxiaogang223@icloud.com>
Author: github-actions[bot]
Date: 2025-07-09 10:58:48 +08:00
Committed by: GitHub
Parent: 1775200cba
Commit: 6c5ec15e10
5 changed files with 17 additions and 3 deletions


@@ -1345,7 +1345,7 @@ DEFINE_mInt64(compaction_batch_size, "-1");
 // If set to false, the parquet reader will not use page index to filter data.
 // This is only for debug purpose, in case sometimes the page index
 // filter wrong data.
-DEFINE_mBool(enable_parquet_page_index, "false");
+DEFINE_mBool(enable_parquet_page_index, "true");
 DEFINE_mBool(ignore_not_found_file_in_external_table, "true");
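For context on what this flag gates: page-index filtering lets the Parquet reader skip whole pages whose column min/max statistics cannot satisfy the predicate, instead of decoding every page in the row group. Below is a minimal, self-contained C++ sketch of that idea; the names (`page_can_match`, the `Page` struct) are illustrative only and are not the actual Doris call sites.

```cpp
#include <iostream>

namespace config {
// Stand-in for the runtime-mutable config; this commit flips the default to true.
bool enable_parquet_page_index = true;
}

// A page can be skipped when the predicate value cannot fall inside
// the page's min/max statistics range.
bool page_can_match(int page_min, int page_max, int predicate_value) {
    return predicate_value >= page_min && predicate_value <= page_max;
}

int main() {
    struct Page { int min; int max; };
    Page pages[] = {{1, 2000}, {3000, 5000}};
    int predicate = 1024;  // mirrors the regression query "where a = 1024"

    for (const Page& p : pages) {
        bool skip = config::enable_parquet_page_index &&
                    !page_can_match(p.min, p.max, predicate);
        std::cout << "page [" << p.min << ", " << p.max << "] "
                  << (skip ? "skipped via page index" : "read") << "\n";
    }
    return 0;
}
```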


@@ -860,7 +860,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group,
             // use the union row range
             skipped_row_ranges.emplace_back(skipped_row_range);
         }
-        _col_offsets.emplace(parquet_col_id, offset_index);
+        _col_offsets[parquet_col_id] = offset_index;
     }
     if (skipped_row_ranges.empty()) {
         read_whole_row_group();
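The one-line change above is the heart of the fix: `emplace` on a standard (unordered) map is a no-op when the key already exists, so the offset index cached for a column while processing an earlier row group was never refreshed, whereas `operator[]` always assigns. A minimal sketch of the difference, assuming `_col_offsets` behaves like a standard map keyed by the Parquet column id and is reused across row groups:

```cpp
#include <cassert>
#include <map>

int main() {
    std::map<int, int> col_offsets;  // stand-in for _col_offsets: column id -> offset index

    col_offsets.emplace(7, 100);     // first row group: entry inserted
    col_offsets.emplace(7, 200);     // second row group: key exists, silently ignored
    assert(col_offsets.at(7) == 100);  // stale value from the first row group

    col_offsets[7] = 200;            // operator[] always assigns
    assert(col_offsets.at(7) == 200);  // entry now reflects the current row group
    return 0;
}
```

With `operator[]`, the entry is overwritten each time a row group's page index is processed, so later row groups filter with their own offset index rather than the first one read.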


@@ -1,4 +1,8 @@
 -- This file is automatically generated. You should know what you did if you want to edit this
+-- !small_2rowgroup --
+1024
+4049
+
 -- !q01 --
 1 2132 4633 4 28.00 28955.64 0.09 0.06 N O 1996-04-21 1996-03-30 1996-05-16 NONE AIR lites. fluffily even de
 1 15635 638 6 32.00 49620.16 0.07 0.02 N O 1996-01-30 1996-02-07 1996-02-03 DELIVER IN PERSON MAIL arefully slyly ex


@@ -94,11 +94,21 @@ suite("test_hive_parquet_skip_page", "p0,external,hive,external_docker,external_
         return;
     }
+    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+    def hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
+    def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+    def hdfsUserName = "doris"
+    def uri = "${defaultFS}" + "/user/doris/preinstalled_data/parquet/small_2rowgroup.parquet"
+    qt_small_2rowgroup """ select * from HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",
+                "format" = "parquet") where a = 1024 or a = 4049
+                order by a;"""
     for (String hivePrefix : ["hive2", "hive3"]) {
         try {
             String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
             String catalog_name = "${hivePrefix}_test_parquet_skip_page"
             String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
             sql """drop catalog if exists ${catalog_name}"""
             sql """create catalog if not exists ${catalog_name} properties (