From 200b558156ef8ec041655dbdd83eab8491a76f31 Mon Sep 17 00:00:00 2001
From: "jiafeng.zhang"
Date: Thu, 11 Aug 2022 09:57:26 +0800
Subject: [PATCH] [typo](doc)spark load uses kerberos authentication method
 (#11662)

spark load uses kerberos authentication method
---
 .../import/import-way/spark-load-manual.md | 50 +++++++++++++++----
 .../import/import-way/spark-load-manual.md | 42 +++++++++++++++-
 2 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/docs/en/docs/data-operate/import/import-way/spark-load-manual.md b/docs/en/docs/data-operate/import/import-way/spark-load-manual.md
index d801d3af54..49dfc5628b 100644
--- a/docs/en/docs/data-operate/import/import-way/spark-load-manual.md
+++ b/docs/en/docs/data-operate/import/import-way/spark-load-manual.md
@@ -153,7 +153,11 @@ PROPERTIES
     spark_conf_key = spark_conf_value,
     working_dir = path,
     broker = broker_name,
-    broker.property_key = property_value
+    broker.property_key = property_value,
+    hadoop.security.authentication = kerberos,
+    kerberos_principal = doris@YOUR.COM,
+    kerberos_keytab = /home/doris/my.keytab,
+    kerberos_keytab_content = ASDOWHDLAWIDJHWLDKSALDJSDIWALD
 )
 
 -- drop spark resource
@@ -178,7 +182,6 @@ REVOKE USAGE_PRIV ON RESOURCE resource_name FROM ROLE role_name
 
 `Properties` are the parameters related to spark resources, as follows:
 
 - `type`: resource type, required. Currently, only spark is supported.
-
 - Spark related parameters are as follows:
 - `spark.master`: required, yarn is supported at present, `spark://host:port`.
@@ -190,11 +193,12 @@ REVOKE USAGE_PRIV ON RESOURCE resource_name FROM ROLE role_name
 - `spark.hadoop.fs.defaultFS`: required when master is yarn.
 - Other parameters are optional; refer to `http://spark.apache.org/docs/latest/configuration.html`
-
 - `working_dir`: directory used by ETL. Required when Spark is used as an ETL resource. For example: `hdfs://host:port/tmp/doris`.
-
+- `hadoop.security.authentication`: Specify the authentication method as kerberos.
+- `kerberos_principal`: Specify the principal of kerberos.
+- `kerberos_keytab`: Specify the path to the kerberos keytab file. It must be the absolute path of a file on the server where the Broker process runs, and the Broker process must be able to access it.
+- `kerberos_keytab_content`: Specify the base64-encoded content of the kerberos keytab file. This property and `kerberos_keytab` are mutually exclusive; configure exactly one of them.
 - `broker`: the name of the broker. Required when Spark is used as an ETL resource. The broker must be added in advance with the `ALTER SYSTEM ADD BROKER` command.
-
 - `broker.property_key`: the authentication information that the broker needs when reading the intermediate files generated by ETL.
 
 Example:
@@ -231,6 +235,38 @@ PROPERTIES
 );
 ```
 
+**Spark Load supports Kerberos authentication**
+
+If Spark Load needs to access a Hadoop cluster secured with Kerberos authentication, you only need to specify the following parameters when creating the Spark resource:
+
+- `hadoop.security.authentication`: Specify the authentication method as kerberos.
+- `kerberos_principal`: Specify the principal of kerberos.
+- `kerberos_keytab`: Specify the path to the kerberos keytab file. It must be the absolute path of a file on the server where the Broker process runs, and the Broker process must be able to access it.
+- `kerberos_keytab_content`: Specify the base64-encoded content of the kerberos keytab file. This property and `kerberos_keytab` are mutually exclusive; configure exactly one of them (a sketch of this form follows).
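+
+A minimal sketch of the `kerberos_keytab_content` form, for illustration only: the resource name `spark_kerberos_content` and the base64 string are placeholders, and the remaining properties mirror the full example under `Example:` below, which uses the `kerberos_keytab` path form instead.
+
+```sql
+-- Hypothetical sketch: authenticate with an inline base64-encoded keytab
+-- instead of a keytab file path. kerberos_keytab and kerberos_keytab_content
+-- are mutually exclusive, so only one of them may be set.
+CREATE EXTERNAL RESOURCE "spark_kerberos_content"
+PROPERTIES
+(
+    "type" = "spark",
+    "spark.master" = "yarn",
+    "spark.submit.deployMode" = "cluster",
+    "spark.hadoop.yarn.resourcemanager.address" = "127.0.0.1:9999",
+    "spark.hadoop.fs.defaultFS" = "hdfs://127.0.0.1:10000",
+    "working_dir" = "hdfs://127.0.0.1:10000/tmp/doris",
+    "broker" = "broker0",
+    "hadoop.security.authentication" = "kerberos",
+    "kerberos_principal" = "doris@YOUR.COM",
+    "kerberos_keytab_content" = "ASDOWHDLAWIDJHWLDKSALDJSDIWALD"
+);
+```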
+
+Example:
+
+```sql
+CREATE EXTERNAL RESOURCE "spark_on_kerberos"
+PROPERTIES
+(
+    "type" = "spark",
+    "spark.master" = "yarn",
+    "spark.submit.deployMode" = "cluster",
+    "spark.jars" = "xxx.jar,yyy.jar",
+    "spark.files" = "/tmp/aaa,/tmp/bbb",
+    "spark.executor.memory" = "1g",
+    "spark.yarn.queue" = "queue0",
+    "spark.hadoop.yarn.resourcemanager.address" = "127.0.0.1:9999",
+    "spark.hadoop.fs.defaultFS" = "hdfs://127.0.0.1:10000",
+    "working_dir" = "hdfs://127.0.0.1:10000/tmp/doris",
+    "broker" = "broker0",
+    "hadoop.security.authentication" = "kerberos",
+    "kerberos_principal" = "doris@YOUR.COM",
+    "kerberos_keytab" = "/home/doris/my.keytab"
+);
+```
+
 **Show resources**
 
 Ordinary accounts can only see the resources on which they have `USAGE_PRIV`.
@@ -248,22 +284,18 @@ You can use the `USAGE_PRIV` permission is given to a user or a role, and the ro
 GRANT USAGE_PRIV ON RESOURCE "spark0" TO "user0"@"%";
-
 -- Grant permission to the spark0 resource to role ROLE0
 GRANT USAGE_PRIV ON RESOURCE "spark0" TO ROLE "role0";
-
 -- Grant permission to all resources to user user0
 GRANT USAGE_PRIV ON RESOURCE * TO "user0"@"%";
-
 -- Grant permission to all resources to role ROLE0
 GRANT USAGE_PRIV ON RESOURCE * TO ROLE "role0";
-
 -- Revoke the spark0 resource permission of user user0
 REVOKE USAGE_PRIV ON RESOURCE "spark0" FROM "user0"@"%";
diff --git a/docs/zh-CN/docs/data-operate/import/import-way/spark-load-manual.md b/docs/zh-CN/docs/data-operate/import/import-way/spark-load-manual.md
index b6269ea176..d8bc642296 100644
--- a/docs/zh-CN/docs/data-operate/import/import-way/spark-load-manual.md
+++ b/docs/zh-CN/docs/data-operate/import/import-way/spark-load-manual.md
@@ -126,7 +126,11 @@ PROPERTIES
     spark_conf_key = spark_conf_value,
     working_dir = path,
     broker = broker_name,
-    broker.property_key = property_value
+    broker.property_key = property_value,
+    hadoop.security.authentication = kerberos,
+    kerberos_principal = doris@YOUR.COM,
+    kerberos_keytab = /home/doris/my.keytab,
+    kerberos_keytab_content = ASDOWHDLAWIDJHWLDKSALDJSDIWALD
 )
 
 -- drop spark resource
@@ -158,6 +162,10 @@ REVOKE USAGE_PRIV ON RESOURCE resource_name FROM ROLE role_name
 - `spark.hadoop.fs.defaultFS`: required when master is yarn.
 - Other parameters are optional; refer to http://spark.apache.org/docs/latest/configuration.html
 - `working_dir`: directory used by ETL. Required when Spark is used as an ETL resource. For example: hdfs://host:port/tmp/doris.
+- `hadoop.security.authentication`: Specify the authentication method as kerberos.
+- `kerberos_principal`: Specify the principal of kerberos.
+- `kerberos_keytab`: Specify the path to the kerberos keytab file. It must be the absolute path of a file on the server where the Broker process runs, and the Broker process must be able to access it.
+- `kerberos_keytab_content`: Specify the base64-encoded content of the kerberos keytab file. This property and `kerberos_keytab` are mutually exclusive; configure exactly one of them.
 - `broker`: the broker name. Required when Spark is used as an ETL resource. The broker must be configured in advance with the `ALTER SYSTEM ADD BROKER` command.
 - `broker.property_key`: authentication information and other properties that the broker needs when reading the intermediate files generated by ETL.
 
@@ -195,6 +203,38 @@ PROPERTIES
 );
 ```
 
+**Spark Load supports Kerberos authentication**
+
+If Spark Load needs to access a Hadoop cluster secured with Kerberos authentication, you only need to specify the following parameters when creating the Spark resource:
+
+- `hadoop.security.authentication`: Specify the authentication method as kerberos.
+- `kerberos_principal`: Specify the principal of kerberos.
+- `kerberos_keytab`: Specify the path to the kerberos keytab file. It must be the absolute path of a file on the server where the Broker process runs, and the Broker process must be able to access it.
+- `kerberos_keytab_content`: Specify the base64-encoded content of the kerberos keytab file. This property and `kerberos_keytab` are mutually exclusive; configure exactly one of them.
+
+Example:
+
+```sql
+CREATE EXTERNAL RESOURCE "spark_on_kerberos"
+PROPERTIES
+(
+    "type" = "spark",
+    "spark.master" = "yarn",
+    "spark.submit.deployMode" = "cluster",
+    "spark.jars" = "xxx.jar,yyy.jar",
+    "spark.files" = "/tmp/aaa,/tmp/bbb",
+    "spark.executor.memory" = "1g",
"spark.yarn.queue" = "queue0", + "spark.hadoop.yarn.resourcemanager.address" = "127.0.0.1:9999", + "spark.hadoop.fs.defaultFS" = "hdfs://127.0.0.1:10000", + "working_dir" = "hdfs://127.0.0.1:10000/tmp/doris", + "broker" = "broker0", + "hadoop.security.authentication" = "kerberos", + "kerberos_principal" = "doris@YOUR.COM", + "kerberos_keytab" = "/home/doris/my.keytab" +); +``` + **查看资源** 普通账户只能看到自己有 USAGE_PRIV 使用权限的资源。