From 586df24b9d25fc7ec4cbe679257afbaa04d4afc0 Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Wed, 3 Apr 2024 16:26:13 +0800 Subject: [PATCH] [fix](tvf) Support fs.defaultFS with postfix '/' (#33202) For HDFS tvf like: ``` select count(*) from hdfs( "uri" = "hdfs://HDFS8000871/path/to/1.parquet", "fs.defaultFS" = "hdfs://HDFS8000871/", "format" = "parquet" ); ``` Before, if the `fs.defaultFS` ends with `/`, the query will fail with an error like: ``` reason: RemoteException: File does not exist: /user/doris/path/to/1.parquet ``` You can see that this is a wrong path with the wrong prefix `/user/doris`. Users need to set `fs.defaultFS` to `hdfs://HDFS8000871` to avoid this error. This PR fixes this issue --- be/src/util/hdfs_util.cpp | 5 +++++ .../parquet/test_hive_read_parquet_comlex_type.groovy | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/be/src/util/hdfs_util.cpp b/be/src/util/hdfs_util.cpp index f82b51cb27..6e99fdea3d 100644 --- a/be/src/util/hdfs_util.cpp +++ b/be/src/util/hdfs_util.cpp @@ -45,6 +45,11 @@ Path convert_path(const Path& path, const std::string& namenode) { Path real_path(path); if (path.string().find(namenode) != std::string::npos) { std::string real_path_str = path.string().substr(namenode.size()); + if (!real_path_str.starts_with("/")) { + // The real path must start with "/". 
+ // Otherwise the hadoop client will add a prefix like "/user/hadoop". 
+ real_path_str = "/" + real_path_str; + } real_path = real_path_str; } return real_path; diff --git a/regression-test/suites/external_table_p0/export/hive_read/parquet/test_hive_read_parquet_comlex_type.groovy b/regression-test/suites/external_table_p0/export/hive_read/parquet/test_hive_read_parquet_comlex_type.groovy index 382906a1bb..e4e80d74b5 100644 --- a/regression-test/suites/external_table_p0/export/hive_read/parquet/test_hive_read_parquet_comlex_type.groovy +++ b/regression-test/suites/external_table_p0/export/hive_read/parquet/test_hive_read_parquet_comlex_type.groovy @@ -41,6 +41,7 @@ suite("test_hive_read_parquet_complex_type", "external,hive,external_docker") { def hdfsUserName = "doris" def format = "parquet" def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" + def defaultFS_with_postfix = "hdfs://${externalEnvIp}:${hdfs_port}/" def outfile_path = "/user/doris/tmp_data" def uri = "${defaultFS}" + "${outfile_path}/exp_" @@ -99,7 +100,6 @@ suite("test_hive_read_parquet_complex_type", "external,hive,external_docker") { INTO OUTFILE "${uri}" FORMAT AS ${format} PROPERTIES ( - "fs.defaultFS"="${defaultFS}", "hadoop.username" = "${hdfsUserName}" ); """ @@ -147,6 +147,7 @@ suite("test_hive_read_parquet_complex_type", "external,hive,external_docker") { qt_select_tvf1 """ select * from HDFS( "uri" = "${outfile_url}0.parquet", + "fs.defaultFS" = "${defaultFS_with_postfix}", "hadoop.username" = "${hdfsUserName}", "format" = "${format}"); """ @@ -185,6 +186,7 @@ suite("test_hive_read_parquet_complex_type", "external,hive,external_docker") { qt_select_tvf2 """ select * from HDFS( "uri" = "${outfile_url}0.parquet", + "fs.defaultFS" = "${defaultFS}", "hadoop.username" = "${hdfsUserName}", "format" = "${format}"); """