From 9df72a96f39261b45edd15fd12db9546d7e7f321 Mon Sep 17 00:00:00 2001
From: Qi Chen
Date: Wed, 13 Sep 2023 00:20:12 +0800
Subject: [PATCH] [Feature](multi-catalog) Support hadoop viewfs. (#24168)

### Feature

Support hadoop viewfs.

### Test

- Regression tests:
  - hive viewfs test.
  - tvf viewfs test.
- Broker load, with broker and with hdfs, tested manually.

---
 be/src/io/fs/hdfs_file_system.cpp             |  8 +--
 docs/en/docs/lakehouse/multi-catalog/hive.md  | 22 +++++++
 .../docs/lakehouse/multi-catalog/hive.md      | 22 +++++++
 .../apache/doris/analysis/StorageBackend.java |  6 +-
 .../org/apache/doris/common/FeConstants.java  |  1 +
 .../apache/doris/fs/FileSystemFactory.java    |  3 +-
 .../planner/external/FileQueryScanNode.java   |  2 +
 .../doris/broker/hdfs/FileSystemManager.java  |  3 +-
 .../hive/test_viewfs_hive.out                 | 35 +++++++++++
 .../external_table_p2/tvf/test_tvf_p2.out     |  3 +
 .../hive/test_viewfs_hive.groovy              | 59 +++++++++++++++++++
 .../external_table_p2/tvf/test_tvf_p2.groovy  |  9 +++
 12 files changed, 163 insertions(+), 10 deletions(-)
 create mode 100644 regression-test/data/external_table_p2/hive/test_viewfs_hive.out
 create mode 100644 regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy

diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 5294890324..1e71ade934 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -138,7 +138,7 @@ HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, const std::string
           _hdfs_params(hdfs_params),
           _fs_handle(nullptr),
           _profile(profile) {
-    if (_hdfs_params.__isset.fs_name) {
+    if (fs_name.empty() && _hdfs_params.__isset.fs_name) {
         _fs_name = _hdfs_params.fs_name;
     } else {
         _fs_name = fs_name;
     }
@@ -509,11 +509,7 @@ Status HdfsFileSystemCache::get_connection(const THdfsParams& hdfs_params,
 uint64 HdfsFileSystemCache::_hdfs_hash_code(const THdfsParams& hdfs_params,
                                             const std::string& fs_name) {
     uint64 hash_code = 0;
-    if (hdfs_params.__isset.fs_name) {
-        hash_code += Fingerprint(hdfs_params.fs_name);
-    } else {
-        hash_code += Fingerprint(fs_name);
-    }
+    hash_code += Fingerprint(fs_name);
     if (hdfs_params.__isset.user) {
         hash_code += Fingerprint(hdfs_params.user);
     }
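A note on the BE change above: an explicitly passed `fs_name` now takes precedence over `THdfsParams.fs_name`, and the connection cache hashes the effective `fs_name` unconditionally, so a `viewfs://` handle and an `hdfs://` handle built from the same params get distinct cache entries. A minimal Java sketch of that keying idea (the real BE code is C++; all names here are illustrative, not part of the patch):

```java
import java.util.Objects;

// Illustrative only: key cached filesystem handles by the *effective* fs name,
// so "viewfs://my-cluster" and "hdfs://my-nameservice" never share an entry.
final class FsCacheKey {
    private final String fsName;    // always the resolved fs name, never a fallback
    private final String user;      // optional hadoop user
    private final String principal; // optional kerberos principal

    FsCacheKey(String fsName, String user, String principal) {
        this.fsName = fsName;
        this.user = user;
        this.principal = principal;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof FsCacheKey)) {
            return false;
        }
        FsCacheKey k = (FsCacheKey) o;
        return Objects.equals(fsName, k.fsName)
                && Objects.equals(user, k.user)
                && Objects.equals(principal, k.principal);
    }

    @Override
    public int hashCode() {
        // Mirrors the patched _hdfs_hash_code: the fs name always participates.
        return Objects.hash(fsName, user, principal);
    }
}
```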
diff --git a/docs/en/docs/lakehouse/multi-catalog/hive.md b/docs/en/docs/lakehouse/multi-catalog/hive.md
index 87e562ee88..afa388321c 100644
--- a/docs/en/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/en/docs/lakehouse/multi-catalog/hive.md
@@ -95,6 +95,28 @@ Please place the `krb5.conf` file and `keytab` authentication file under all `BE
 The value of `hive.metastore.kerberos.principal` needs to be consistent with the property of the same name of the connected hive metastore, which can be obtained from `hive-site.xml`.
 
+### Hive On VIEWFS
+
+```sql
+CREATE CATALOG hive PROPERTIES (
+    'type'='hms',
+    'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
+    'hadoop.username' = 'hive',
+    'dfs.nameservices'='your-nameservice',
+    'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
+    'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
+    'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
+    'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
+    'fs.defaultFS' = 'viewfs://your-cluster',
+    'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
+    'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
+);
+```
+
+The viewfs-related parameters can be added to the catalog configuration as above, or to `conf/core-site.xml`.
+
+For details on how viewfs works and how to configure it, please refer to the relevant Hadoop documentation, for example: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html
+
 ### Hive On JuiceFS
 
 Data is stored in JuiceFS, examples are as follows:
diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
index 3242426184..e582d5e907 100644
--- a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
@@ -94,6 +94,28 @@ CREATE CATALOG hive PROPERTIES (
 Place the `krb5.conf` file and the `keytab` authentication file on all `BE` and `FE` nodes, keeping the `keytab` file path consistent with the configuration; the `krb5.conf` file is placed under `/etc/krb5.conf` by default.
 
 The value of `hive.metastore.kerberos.principal` needs to be consistent with the property of the same name of the connected hive metastore, which can be obtained from `hive-site.xml`.
 
+### Hive On VIEWFS
+
+```sql
+CREATE CATALOG hive PROPERTIES (
+    'type'='hms',
+    'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
+    'hadoop.username' = 'hive',
+    'dfs.nameservices'='your-nameservice',
+    'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
+    'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
+    'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
+    'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
+    'fs.defaultFS' = 'viewfs://your-cluster',
+    'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
+    'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
+);
+```
+
+The viewfs-related parameters can be added to the catalog configuration as above, or to `conf/core-site.xml`.
+
+For details on how viewfs works and how to configure it, please refer to the relevant Hadoop documentation, for example: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html
+
 ### Hive On JuiceFS
 
 Data is stored in JuiceFS, an example is as follows:
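The mount-table keys in the examples above are standard Hadoop ViewFs configuration, so the mapping can be sanity-checked outside Doris with the plain Hadoop client. A minimal sketch, assuming the placeholder cluster and nameservice names from the docs (HA failover properties omitted for brevity):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ViewFsCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same properties as in the CREATE CATALOG example above.
        conf.set("fs.defaultFS", "viewfs://your-cluster");
        conf.set("fs.viewfs.mounttable.your-cluster.link./ns1", "hdfs://your-nameservice/");
        conf.set("fs.viewfs.mounttable.your-cluster.homedir", "/ns1");
        // dfs.nameservices / dfs.ha.namenodes.* would also be needed for a real HA cluster.

        // Paths under /ns1 are transparently routed to hdfs://your-nameservice/.
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus status : fs.listStatus(new Path("/ns1"))) {
            System.out.println(status.getPath());
        }
    }
}
```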
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
index ada22ad301..f3d7f7e49f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
@@ -50,6 +50,7 @@ public class StorageBackend implements ParseNode {
         if (!schema.equalsIgnoreCase("bos")
                 && !schema.equalsIgnoreCase("afs")
                 && !schema.equalsIgnoreCase("hdfs")
+                && !schema.equalsIgnoreCase("viewfs")
                 && !schema.equalsIgnoreCase("ofs")
                 && !schema.equalsIgnoreCase("obs")
                 && !schema.equalsIgnoreCase("oss")
@@ -58,8 +59,9 @@
                 && !schema.equalsIgnoreCase("gfs")
                 && !schema.equalsIgnoreCase("jfs")
                 && !schema.equalsIgnoreCase("gs")) {
-            throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'afs://' , 'bos://',"
-                    + " 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://' or 'jfs://' path.");
+            throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'viewfs://', 'afs://',"
+                    + " 'bos://', 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://'"
+                    + " or 'jfs://' path.");
         } else if (type == StorageBackend.StorageType.S3 && !schema.equalsIgnoreCase("s3")) {
             throw new AnalysisException("Invalid export path. please use valid 's3://' path.");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java b/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
index f7fb0348dd..e9e853eb9f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
@@ -88,6 +88,7 @@ public class FeConstants {
     public static String FS_PREFIX_GFS = "gfs";
     public static String FS_PREFIX_JFS = "jfs";
     public static String FS_PREFIX_HDFS = "hdfs";
+    public static String FS_PREFIX_VIEWFS = "viewfs";
     public static String FS_PREFIX_FILE = "file";
     public static final String INTERNAL_DB_NAME = "__internal_schema";
     public static String TEMP_MATERIZLIZE_DVIEW_PREFIX = "internal_tmp_materialized_view_";
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
index c2a070cdfa..3837a7eb95 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
@@ -65,7 +65,8 @@ public class FileSystemFactory {
             } else {
                 fsType = FileSystemType.S3;
             }
-        } else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)) {
+        } else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)
+                || location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
             fsType = FileSystemType.DFS;
         } else if (location.startsWith(FeConstants.FS_PREFIX_OFS) || location.startsWith(FeConstants.FS_PREFIX_COSN)) {
             // ofs:// and cosn:// use the same underlying file system: Tencent Cloud HDFS, aka CHDFS
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
index d59c6bec7d..00188dc50b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
@@ -484,6 +484,8 @@ public abstract class FileQueryScanNode extends FileScanNode {
             return Optional.of(TFileType.FILE_S3);
         } else if (location.startsWith(FeConstants.FS_PREFIX_HDFS)) {
             return Optional.of(TFileType.FILE_HDFS);
+        } else if (location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
+            return Optional.of(TFileType.FILE_HDFS);
         } else if (location.startsWith(FeConstants.FS_PREFIX_COSN)) {
             return Optional.of(TFileType.FILE_HDFS);
         } else if (location.startsWith(FeConstants.FS_PREFIX_FILE)) {
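The FE changes above follow one pattern: the location's scheme picks the filesystem implementation and reader type, and `viewfs` simply joins the existing `hdfs` branch, because Hadoop's FileSystem layer resolves the mount table transparently. A condensed Java sketch of that dispatch (illustrative names, not the actual FE API):

```java
import java.net.URI;
import java.util.Locale;
import java.util.Set;

// Illustrative dispatch: viewfs:// rides the same DFS/HDFS code path as hdfs://.
final class LocationRouter {
    private static final Set<String> DFS_SCHEMES = Set.of("hdfs", "viewfs", "gfs");

    static String fileTypeFor(String location) {
        String scheme = URI.create(location).getScheme();
        if (scheme == null) {
            throw new IllegalArgumentException("invalid path, scheme is null: " + location);
        }
        if (DFS_SCHEMES.contains(scheme.toLowerCase(Locale.ROOT))) {
            return "FILE_HDFS"; // one reader implementation for all DFS-like schemes
        }
        throw new IllegalArgumentException("unsupported scheme: " + scheme);
    }

    public static void main(String[] args) {
        System.out.println(fileTypeFor("viewfs://my-cluster/ns1/warehouse/t")); // FILE_HDFS
    }
}
```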
diff --git a/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java b/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
index d25947e33b..997cf0cb2c 100644
--- a/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
+++ b/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
@@ -66,6 +66,7 @@ public class FileSystemManager {
             .getLogger(FileSystemManager.class.getName());
     // supported scheme
     private static final String HDFS_SCHEME = "hdfs";
+    private static final String VIEWFS_SCHEME = "viewfs";
     private static final String S3A_SCHEME = "s3a";
     private static final String KS3_SCHEME = "ks3";
     private static final String CHDFS_SCHEME = "ofs";
@@ -210,7 +211,7 @@ public class FileSystemManager {
                     "invalid path. scheme is null");
         }
         BrokerFileSystem brokerFileSystem = null;
-        if (scheme.equals(HDFS_SCHEME)) {
+        if (scheme.equals(HDFS_SCHEME) || scheme.equals(VIEWFS_SCHEME)) {
             brokerFileSystem = getDistributedFileSystem(path, properties);
         } else if (scheme.equals(S3A_SCHEME)) {
             brokerFileSystem = getS3AFileSystem(path, properties);
diff --git a/regression-test/data/external_table_p2/hive/test_viewfs_hive.out b/regression-test/data/external_table_p2/hive/test_viewfs_hive.out
new file mode 100644
index 0000000000..37e1df0ba0
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_viewfs_hive.out
@@ -0,0 +1,35 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !viewfs --
+1	Tom	48	\N	male
+2	Jerry	35	\N	male
+3	Frank	25	\N	male
+4	Ada	22	\N	female
+
+-- !viewfs_partition1 --
+1	Tom	48	\N	male	20230101
+2	Jerry	35	\N	male	20230101
+3	Frank	25	\N	male	20230201
+4	Ada	22	\N	female	20230201
+
+-- !viewfs_partition2 --
+1	Tom	48	\N	male	20230101
+2	Jerry	35	\N	male	20230101
+
+-- !viewfs_partition3 --
+3	Frank	25	\N	male	20230201
+4	Ada	22	\N	female	20230201
+
+-- !viewfs_mixed_partition1 --
+1	Tom	48	\N	male	20230101
+2	Jerry	35	\N	male	20230101
+3	Frank	25	\N	male	20230201
+4	Ada	22	\N	female	20230201
+
+-- !viewfs_mixed_partition2 --
+1	Tom	48	\N	male	20230101
+2	Jerry	35	\N	male	20230101
+
+-- !viewfs_mixed_partition3 --
+3	Frank	25	\N	male	20230201
+4	Ada	22	\N	female	20230201
+
diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
index cb7239f2ff..86f3f43f2d 100644
--- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
+++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
@@ -42,3 +42,6 @@
 -- !row_cross_pages --
 25001	25001	25001
 
+-- !viewfs --
+25001	25001	25001
+
diff --git a/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy b/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy
new file mode 100644
index 0000000000..9ccea773ed
--- /dev/null
+++ b/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_viewfs_hive", "p2,external,hive,external_remote,external_remote_hive") {
+
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+        String nameNodeHost = context.config.otherConfigs.get("extHiveHmsHost")
+        String hdfsPort = context.config.otherConfigs.get("extHdfsPort")
+        String catalog_name = "test_viewfs_hive"
+
+        sql """drop catalog if exists ${catalog_name};"""
+
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}',
+                'fs.viewfs.mounttable.my-cluster.link./ns1' = 'hdfs://${nameNodeHost}:${hdfsPort}/',
+                'fs.viewfs.mounttable.my-cluster.homedir' = '/ns1',
+                'fs.defaultFS' = 'viewfs://my-cluster'
+            );
+        """
+        logger.info("catalog " + catalog_name + " created")
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+
+        sql """ use viewfs """
+
+        // The table location is on viewfs.
+        qt_viewfs """ select * from test_viewfs order by id"""
+
+        // The partition locations of the table are on viewfs.
+        qt_viewfs_partition1 """ select * from test_viewfs_partition order by id"""
+        qt_viewfs_partition2 """ select * from test_viewfs_partition where part_col = 20230101 order by id"""
+        qt_viewfs_partition3 """ select * from test_viewfs_partition where part_col = 20230201 order by id"""
+
+        // The table mixes partitions with hdfs locations and partitions with viewfs locations.
+        qt_viewfs_mixed_partition1 """ select * from test_viewfs_mixed_partition order by id"""
+        qt_viewfs_mixed_partition2 """ select * from test_viewfs_mixed_partition where part_col = 20230101 order by id"""
+        qt_viewfs_mixed_partition3 """ select * from test_viewfs_mixed_partition where part_col = 20230201 order by id"""
+    }
+}
diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
index ec3fb41d95..853b5d2f4d 100644
--- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
+++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
@@ -53,5 +53,14 @@ suite("test_tvf_p2", "p2,external,tvf,external_remote,external_remote_tvf") {
                     "uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/row_cross_pages.parquet",
                     "format" = "parquet",
                     "fs.defaultFS" = "hdfs://${nameNodeHost}:${hdfsPort}")"""
+
+        // viewfs
+        qt_viewfs """select count(id), count(m1), count(m2)
+                    from hdfs(
+                        "uri" = "viewfs://my-cluster/ns1/catalog/tvf/parquet/row_cross_pages.parquet",
+                        "format" = "parquet",
+                        "fs.defaultFS" = "viewfs://my-cluster",
+                        "fs.viewfs.mounttable.my-cluster.link./ns1" = "hdfs://${nameNodeHost}:${hdfsPort}/",
+                        "fs.viewfs.mounttable.my-cluster.homedir" = "/ns1")"""
     }
 }