diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index fa8e5e9272..640088a742 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1345,7 +1345,7 @@ DEFINE_mInt64(compaction_batch_size, "-1"); // If set to false, the parquet reader will not use page index to filter data. // This is only for debug purpose, in case sometimes the page index // filter wrong data. -DEFINE_mBool(enable_parquet_page_index, "false"); +DEFINE_mBool(enable_parquet_page_index, "true"); DEFINE_mBool(ignore_not_found_file_in_external_table, "true"); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index e51d27ce20..d14d77f70c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -860,7 +860,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group, // use the union row range skipped_row_ranges.emplace_back(skipped_row_range); } - _col_offsets.emplace(parquet_col_id, offset_index); + _col_offsets[parquet_col_id] = offset_index; } if (skipped_row_ranges.empty()) { read_whole_row_group(); diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet new file mode 100644 index 0000000000..dcd05f5e28 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet differ diff --git a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out index 6c869dbc78..a8973479e4 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out +++ b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out @@ -1,4 +1,8 @@ -- This file is automatically generated. You should know what you did if you want to edit this +-- !small_2rowgroup -- +1024 +4049 + -- !q01 -- 1 2132 4633 4 28.00 28955.64 0.09 0.06 N O 1996-04-21 1996-03-30 1996-05-16 NONE AIR lites. fluffily even de 1 15635 638 6 32.00 49620.16 0.07 0.02 N O 1996-01-30 1996-02-07 1996-02-03 DELIVER IN PERSON MAIL arefully slyly ex diff --git a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy index ebdbedf139..0bded7d820 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy @@ -94,11 +94,21 @@ suite("test_hive_parquet_skip_page", "p0,external,hive,external_docker,external_ return; } + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + def hdfs_port = context.config.otherConfigs.get("hive2HdfsPort") + def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" + def hdfsUserName = "doris" + def uri = "${defaultFS}" + "/user/doris/preinstalled_data/parquet/small_2rowgroup.parquet" + qt_small_2rowgroup """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "parquet") where a = 1024 or a = 4049 + order by a;""" + for (String hivePrefix : ["hive2", "hive3"]) { try { String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") String catalog_name = "${hivePrefix}_test_parquet_skip_page" - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") sql """drop catalog if exists ${catalog_name}""" sql """create catalog if not exists ${catalog_name} properties (