From 994db8b400bf72805e2e32097a9fb00d968462ad Mon Sep 17 00:00:00 2001
From: slothever <18522955+wsjz@users.noreply.github.com>
Date: Tue, 31 Oct 2023 17:09:52 +0800
Subject: [PATCH] [fix](multi-catalog)add the FAQ for Aliyun DLF and add the fs.xx.impl check (#25594)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. add the FAQ for Aliyun DLF, including the DLF configuration and the Jindo SDK locations
2. add the fs.xx.impl check when loading cache files
---
 docs/en/docs/lakehouse/faq.md                 |  8 +++++
 docs/en/docs/lakehouse/multi-catalog/dlf.md   | 32 +++++++++--------
 docs/zh-CN/docs/lakehouse/faq.md              |  6 ++++
 .../zh-CN/docs/lakehouse/multi-catalog/dlf.md | 34 ++++++++++---------
 .../datasource/hive/HiveMetaStoreCache.java   |  6 ++--
 .../property/PropertyConverter.java           |  7 ++--
 6 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/docs/en/docs/lakehouse/faq.md b/docs/en/docs/lakehouse/faq.md
index 55e21a106d..a6c97cfbb6 100644
--- a/docs/en/docs/lakehouse/faq.md
+++ b/docs/en/docs/lakehouse/faq.md
@@ -87,6 +87,8 @@ under the License.
    Add `-Djava.security.krb5.conf=/your-path` to the `JAVA_OPTS` of the broker startup script `start_broker.sh`.
+8. When a Kerberos configuration is used in the Catalog, the `hadoop.username` property cannot be used in the Catalog properties at the same time.
+
 ## JDBC Catalog
 1. An error is reported when connecting to SQLServer through JDBC Catalog: `unable to find valid certification path to requested target`
@@ -267,4 +269,10 @@ under the License.
    Note that the value here is the cumulative value of a single HDFS Client, not the value of a single query. The same HDFS Client will be reused by multiple queries.
+## DLF Catalog
+1. When using a DLF Catalog, if BE reports `Invalid address` while reading JindoFS data, add the domain-name-to-IP mappings that appear in the log to `/etc/hosts`.
+
+2. If you are not authorized to read the data, use the `hadoop.username` property to specify a user that is.
+
+3. The metadata in the DLF Catalog is kept consistent with DLF. When DLF is used to manage metadata, newly imported Hive partitions may not have been synchronized by DLF yet, causing the DLF and Hive metadata to diverge. In this case, first make sure the Hive metadata is fully synchronized by DLF.
diff --git a/docs/en/docs/lakehouse/multi-catalog/dlf.md b/docs/en/docs/lakehouse/multi-catalog/dlf.md
index 763fa9fdd8..7fb88bb316 100644
--- a/docs/en/docs/lakehouse/multi-catalog/dlf.md
+++ b/docs/en/docs/lakehouse/multi-catalog/dlf.md
@@ -67,23 +67,25 @@ Doris supports accessing Hive/Iceberg/Hudi metadata in DLF.
 ### Use OSS-HDFS as the datasource
 1. Enable OSS-HDFS. [Grant access to OSS or OSS-HDFS](https://www.alibabacloud.com/help/en/e-mapreduce/latest/oss-hdfsnew)
-2. Download the SDK. [JindoData SDK](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/5.x/5.0.0-beta7/jindodata_download.md)
-3. Decompress the jindosdk.tar.gz, and then enter its lib directory and put `jindo-core.jar, jindo-sdk.jar` to both `${DORIS_HOME}/fe/lib` and `${DORIS_HOME}/be/lib/java_extensions`.
+2. Download the SDK. [JindoData SDK](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/5.x/5.0.0-beta7/jindodata_download.md). If the Jindo SDK directory already exists on the cluster, skip this step.
+3. Decompress jindosdk.tar.gz, or locate the existing Jindo SDK directory on the cluster, then go to its lib directory and copy `jindo-core.jar, jindo-sdk.jar` into both `${DORIS_HOME}/fe/lib` and `${DORIS_HOME}/be/lib/java_extensions/preload-extensions`.
 4. Create DLF Catalog, set `oss.hdfs.enabled` as `true`:
-```sql
-CREATE CATALOG dlf_oss_hdfs PROPERTIES (
-    "type"="hms",
-    "hive.metastore.type" = "dlf",
-    "dlf.proxy.mode" = "DLF_ONLY",
-    "dlf.endpoint" = "datalake-vpc.cn-beijing.aliyuncs.com",
-    "dlf.region" = "cn-beijing",
-    "dlf.uid" = "uid",
-    "dlf.access_key" = "ak",
-    "dlf.secret_key" = "sk",
-    "oss.hdfs.enabled" = "true"
-);
-```
+   ```sql
+   CREATE CATALOG dlf_oss_hdfs PROPERTIES (
+       "type"="hms",
+       "hive.metastore.type" = "dlf",
+       "dlf.proxy.mode" = "DLF_ONLY",
+       "dlf.endpoint" = "datalake-vpc.cn-beijing.aliyuncs.com",
+       "dlf.region" = "cn-beijing",
+       "dlf.uid" = "uid",
+       "dlf.access_key" = "ak",
+       "dlf.secret_key" = "sk",
+       "oss.hdfs.enabled" = "true"
+   );
+   ```
+
+5. If the Jindo SDK version is inconsistent with the version used on the EMR cluster, a `Plugin not found` error will be reported; replace the Jindo SDK with the matching version.
 ### DLF Iceberg Catalog
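The `Plugin not found` note in step 5 above typically indicates that the JindoSDK classes are missing from, or mismatched on, the FE/BE classpath. The probe below is illustrative and not part of this patch; the only thing taken from the patch is the class name that `fs.oss.impl` points to.

```java
// Minimal classpath probe: run it with the same jars that were copied into
// ${DORIS_HOME}/fe/lib and ${DORIS_HOME}/be/lib/java_extensions/preload-extensions.
// If the class cannot be loaded, the jars are missing or the SDK version does not match.
public class JindoSdkCheck {
    public static void main(String[] args) {
        String clazz = "com.aliyun.jindodata.oss.JindoOssFileSystem";
        try {
            Class.forName(clazz);
            System.out.println("JindoSDK class found: " + clazz);
        } catch (ClassNotFoundException e) {
            System.out.println("JindoSDK class missing or incompatible: " + clazz);
        }
    }
}
```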
diff --git a/docs/zh-CN/docs/lakehouse/faq.md b/docs/zh-CN/docs/lakehouse/faq.md
index 6e3df6dade..c1ab720e9e 100644
--- a/docs/zh-CN/docs/lakehouse/faq.md
+++ b/docs/zh-CN/docs/lakehouse/faq.md
@@ -84,6 +84,7 @@ under the License.
    将 `-Djava.security.krb5.conf=/your-path` 配置项添加到Broker Load启动脚本的 `start_broker.sh` 的 `JAVA_OPTS`里。
+8. 当在Catalog里使用Kerberos配置时，不能同时使用`hadoop.username`属性。
 ## JDBC Catalog
@@ -263,5 +264,10 @@ under the License.
    注意，这里的值是单个 HDFS Client 的累计值，而不是单个查询的数值。同一个 HDFS Client 会被多个查询复用。
+## DLF Catalog
+1. 使用DLF Catalog时，BE在读取JindoFS数据时出现`Invalid address`，需要在`/etc/hosts`中添加日志中出现的域名到IP的映射。
+2. 读取数据无权限时，使用`hadoop.username`属性指定有权限的用户。
+
+3. DLF Catalog中的元数据和DLF保持一致。当使用DLF管理元数据时，Hive新导入的分区可能未被DLF同步，导致出现DLF和Hive元数据不一致的情况。对此，需要先保证Hive元数据被DLF完全同步。
diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/dlf.md b/docs/zh-CN/docs/lakehouse/multi-catalog/dlf.md
index 822ecff1bb..43d30ff5e6 100644
--- a/docs/zh-CN/docs/lakehouse/multi-catalog/dlf.md
+++ b/docs/zh-CN/docs/lakehouse/multi-catalog/dlf.md
@@ -66,24 +66,26 @@ CREATE CATALOG dlf PROPERTIES (
 ### 使用开启了HDFS服务的OSS存储数据
-1. 确认OSS开启了HDFS服务。[开通并授权访问OSS-HDFS服务](https://help.aliyun.com/document_detail/419505.html?spm=a2c4g.2357115.0.i0)
-2. 下载SDK。[JindoData SDK下载](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/5.x/5.0.0-beta7/jindodata_download.md)
-3. 解压下载后的jindosdk.tar.gz，将其lib目录下的`jindo-core.jar、jindo-sdk.jar`放到`${DORIS_HOME}/fe/lib`和`${DORIS_HOME}/be/lib/java_extensions`目录下。
+1. 确认OSS开启了HDFS服务。[开通并授权访问OSS-HDFS服务](https://help.aliyun.com/document_detail/419505.html?spm=a2c4g.2357115.0.i0)。
+2. 下载SDK。[JindoData SDK下载](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/5.x/5.0.0-beta7/jindodata_download.md)。如果集群上已有SDK目录，忽略这一步。
+3. 解压下载后的jindosdk.tar.gz，或者在集群上找到Jindo SDK的目录，将其lib目录下的`jindo-core.jar、jindo-sdk.jar`放到`${DORIS_HOME}/fe/lib`和`${DORIS_HOME}/be/lib/java_extensions/preload-extensions`目录下。
 4. 创建DLF Catalog，并配置`oss.hdfs.enabled`为`true`：
-```sql
-CREATE CATALOG dlf_oss_hdfs PROPERTIES (
-    "type"="hms",
-    "hive.metastore.type" = "dlf",
-    "dlf.proxy.mode" = "DLF_ONLY",
-    "dlf.endpoint" = "datalake-vpc.cn-beijing.aliyuncs.com",
-    "dlf.region" = "cn-beijing",
-    "dlf.uid" = "uid",
-    "dlf.access_key" = "ak",
-    "dlf.secret_key" = "sk",
-    "oss.hdfs.enabled" = "true"
-);
-```
+   ```sql
+   CREATE CATALOG dlf_oss_hdfs PROPERTIES (
+       "type"="hms",
+       "hive.metastore.type" = "dlf",
+       "dlf.proxy.mode" = "DLF_ONLY",
+       "dlf.endpoint" = "datalake-vpc.cn-beijing.aliyuncs.com",
+       "dlf.region" = "cn-beijing",
+       "dlf.uid" = "uid",
+       "dlf.access_key" = "ak",
+       "dlf.secret_key" = "sk",
+       "oss.hdfs.enabled" = "true"
+   );
+   ```
+
+5. 当Jindo SDK版本与EMR集群上所用的版本不一致时，会出现`Plugin not found`的问题，需更换到对应版本。
 ### 访问DLF Iceberg表
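The FE changes that follow replace the old `com.aliyun.emr.fs.oss.*` FileSystem classes with the JindoSDK classes `com.aliyun.jindodata.oss.JindoOssFileSystem` and `com.aliyun.jindodata.oss.OSS`. Below is a rough, standalone illustration of the Hadoop configuration this produces for an OSS-HDFS endpoint. The implementation class names and the endpoint form `<region>.oss-dls.aliyuncs.com` come from the patch; the bucket, region, and credentials are placeholders, and the credential property names follow the common JindoSDK convention rather than anything in this patch.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OssHdfsConfSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // FileSystem implementations from the JindoSDK, matching the classes set by PropertyConverter below.
        conf.set("fs.oss.impl", "com.aliyun.jindodata.oss.JindoOssFileSystem");
        conf.set("fs.AbstractFileSystem.oss.impl", "com.aliyun.jindodata.oss.OSS");
        // OSS-HDFS endpoint of the form <region>.oss-dls.aliyuncs.com; "cn-beijing" is a placeholder.
        conf.set("fs.oss.endpoint", "cn-beijing.oss-dls.aliyuncs.com");
        // Placeholder credentials; property names are the usual JindoSDK ones.
        conf.set("fs.oss.accessKeyId", "ak");
        conf.set("fs.oss.accessKeySecret", "sk");

        // "my-bucket" is a placeholder; listing it verifies that the Jindo implementation
        // is resolved from the classpath and that the endpoint is reachable.
        Path root = new Path("oss://my-bucket/");
        try (FileSystem fs = FileSystem.get(root.toUri(), conf)) {
            for (FileStatus status : fs.listStatus(root)) {
                System.out.println(status.getPath());
            }
        }
    }
}
```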
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
index 8c9b5552a7..480edd60f2 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
@@ -406,8 +406,10 @@ public class HiveMetaStoreCache {
             if (uri.getScheme() != null) {
                 String scheme = uri.getScheme();
                 updateJobConf("fs." + scheme + ".impl.disable.cache", "true");
-                if (!scheme.equals("hdfs") && !scheme.equals("viewfs")) {
-                    updateJobConf("fs." + scheme + ".impl", PropertyConverter.getHadoopFSImplByScheme(scheme));
+                if (jobConf.get("fs." + scheme + ".impl") == null) {
+                    if (!scheme.equals("hdfs") && !scheme.equals("viewfs")) {
+                        updateJobConf("fs." + scheme + ".impl", PropertyConverter.getHadoopFSImplByScheme(scheme));
+                    }
                 }
             }
         } catch (Exception e) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/PropertyConverter.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/PropertyConverter.java
index 174b0808bc..1b0e3b6d97 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/property/PropertyConverter.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/property/PropertyConverter.java
@@ -332,8 +332,8 @@ public class PropertyConverter {
                     region + ".oss-dls.aliyuncs.com");
             }
         }
-        ossProperties.put("fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem");
-        ossProperties.put("fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS");
+        ossProperties.put("fs.oss.impl", "com.aliyun.jindodata.oss.JindoOssFileSystem");
+        ossProperties.put("fs.AbstractFileSystem.oss.impl", "com.aliyun.jindodata.oss.OSS");
     }

     private static Map<String, String> convertToCOSProperties(Map<String, String> props, CloudCredential credential) {
@@ -454,7 +454,8 @@ public class PropertyConverter {
         if (!Strings.isNullOrEmpty(region)) {
             boolean hdfsEnabled = Boolean.parseBoolean(props.getOrDefault(OssProperties.OSS_HDFS_ENABLED, "false"));
             if (hdfsEnabled) {
-                props.putIfAbsent("fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem");
+                props.putIfAbsent("fs.oss.impl", "com.aliyun.jindodata.oss.JindoOssFileSystem");
+                props.put("fs.AbstractFileSystem.oss.impl", "com.aliyun.jindodata.oss.OSS");
                 props.putIfAbsent(OssProperties.REGION, region);
                 // example: cn-shanghai.oss-dls.aliyuncs.com
                 // from https://www.alibabacloud.com/help/en/e-mapreduce/latest/oss-kusisurumen
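In short, the HiveMetaStoreCache change above turns the default `fs.<scheme>.impl` into a fallback: it is only applied when the job configuration does not already carry an implementation for that scheme, so a user-provided class is never overwritten. Below is a standalone sketch of that behavior; the helper name and the custom class are hypothetical, and only the property handling mirrors the patch.

```java
import org.apache.hadoop.conf.Configuration;

public class FsImplFallbackSketch {
    // Apply a default fs.<scheme>.impl only when none is configured, and always
    // disable the FileSystem cache for the scheme, as the patched code does.
    static void applyDefaultFsImpl(Configuration conf, String scheme, String defaultImpl) {
        conf.set("fs." + scheme + ".impl.disable.cache", "true");
        String key = "fs." + scheme + ".impl";
        if (conf.get(key) == null && !scheme.equals("hdfs") && !scheme.equals("viewfs")) {
            conf.set(key, defaultImpl);
        }
    }

    public static void main(String[] args) {
        // 'false' skips Hadoop's default resources so the example stands alone.
        Configuration conf = new Configuration(false);

        applyDefaultFsImpl(conf, "oss", "com.aliyun.jindodata.oss.JindoOssFileSystem");
        System.out.println(conf.get("fs.oss.impl")); // the Jindo default is used

        conf.set("fs.oss.impl", "com.example.CustomOssFileSystem"); // hypothetical user override
        applyDefaultFsImpl(conf, "oss", "com.aliyun.jindodata.oss.JindoOssFileSystem");
        System.out.println(conf.get("fs.oss.impl")); // the user-provided class is kept
    }
}
```

With this check in place, a catalog that explicitly sets `fs.oss.impl` (or any other `fs.<scheme>.impl`) in its properties keeps that value instead of the converter's default.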