[docs] Add user manual for hdfs load and transaction. (#7497)

pengxiangyu
2021-12-30 10:22:48 +08:00
committed by GitHub
parent 0894848045
commit dc9cd34047
15 changed files with 492 additions and 341 deletions

View File

@ -643,13 +643,13 @@ CONF_mInt32(external_table_connect_timeout_sec, "5");
CONF_mInt32(segment_cache_capacity, "1000000");
// s3 config
CONF_String(s3_ak, "");
CONF_String(s3_sk, "");
CONF_String(s3_endpoint, "");
CONF_String(s3_region, "");
CONF_mInt32(s3_max_conn, "50");
CONF_mInt32(s3_request_timeout_ms, "3000");
CONF_mInt32(s3_conn_timeout_ms, "1000");
CONF_String(default_remote_storage_s3_ak, "");
CONF_String(default_remote_storage_s3_sk, "");
CONF_String(default_remote_storage_s3_endpoint, "");
CONF_String(default_remote_storage_s3_region, "");
CONF_mInt32(default_remote_storage_s3_max_conn, "50");
CONF_mInt32(default_remote_storage_s3_request_timeout_ms, "3000");
CONF_mInt32(default_remote_storage_s3_conn_timeout_ms, "1000");
// Set to true to disable the minidump feature.
CONF_Bool(disable_minidump , "false");

View File

@ -158,13 +158,17 @@ private:
Status RemoteEnv::init_conf() {
std::map<std::string, std::string> storage_prop;
storage_prop[S3_AK] = doris::config::s3_ak;
storage_prop[S3_SK] = doris::config::s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::s3_endpoint;
storage_prop[S3_REGION] = doris::config::s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::s3_conn_timeout_ms);
if (doris::config::default_remote_storage_s3_ak.empty() || doris::config::default_remote_storage_s3_sk.empty()
|| doris::config::default_remote_storage_s3_endpoint.empty() || doris::config::default_remote_storage_s3_region.empty()) {
return Status::OK();
}
storage_prop[S3_AK] = doris::config::default_remote_storage_s3_ak;
storage_prop[S3_SK] = doris::config::default_remote_storage_s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::default_remote_storage_s3_endpoint;
storage_prop[S3_REGION] = doris::config::default_remote_storage_s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::default_remote_storage_s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_conn_timeout_ms);
if (ClientFactory::is_s3_conf_valid(storage_prop)) {
_storage_backend.reset(new S3StorageBackend(storage_prop));

View File

@ -193,6 +193,13 @@ Status HDFSWriter::_parse_properties(std::map<std::string, std::string>& prop) {
LOG(ERROR) << "hdfs properties is incorrect.";
return Status::InternalError("hdfs properties is incorrect");
}
// If _path has the form hdfs://ip:port/path, strip the namenode prefix and keep only /path,
// because libhdfs3 cannot open a path that contains the hdfs://ip:port prefix.
if (_path.find(_namenode) != _path.npos) {
_path = _path.substr(_namenode.size());
}
return Status::OK();
}

View File

@ -71,12 +71,12 @@ DataDir::DataDir(const std::string& path, int64_t capacity_bytes,
_disk_capacity_bytes(0),
_storage_medium(storage_medium),
_is_used(false),
_env(Env::get_env(storage_medium)),
_tablet_manager(tablet_manager),
_txn_manager(txn_manager),
_cluster_id(-1),
_cluster_id_incomplete(false),
_to_be_deleted(false),
_env(Env::get_env(storage_medium)),
_current_shard(0),
_meta(nullptr) {
_path_desc.storage_medium = storage_medium;

View File

@ -22,6 +22,7 @@
#include <sstream>
#include "exec/broker_writer.h"
#include "exec/hdfs_reader_writer.h"
#include "exec/local_file_writer.h"
#include "exec/s3_writer.h"
#include "exprs/expr.h"
@ -258,6 +259,15 @@ Status ExportSink::open_file_writer() {
_file_writer.reset(s3_writer);
break;
}
case TFileType::FILE_HDFS: {
FileWriter* hdfs_writer;
RETURN_IF_ERROR(HdfsReaderWriter::create_writer(
const_cast<std::map<std::string, std::string>&>(_t_export_sink.properties),
_t_export_sink.export_path + "/" + file_name, &hdfs_writer));
RETURN_IF_ERROR(hdfs_writer->open());
_file_writer.reset(hdfs_writer);
break;
}
default: {
std::stringstream ss;
ss << "Unknown file type, type=" << _t_export_sink.file_type;

View File

@ -563,6 +563,7 @@ module.exports = [
title: "Data Manipulation",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL DELETE",
"CANCEL LABEL",
@ -575,6 +576,7 @@ module.exports = [
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESTORE TABLET",

View File

@ -569,6 +569,7 @@ module.exports = [
title: "DML",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL LOAD",
"CREATE SYNC JOB",
@ -578,6 +579,7 @@ module.exports = [
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESUME ROUTINE LOAD",

View File

@ -139,166 +139,7 @@ Planning example for concurrent export:
## Usage example
1. Example 1
Export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
```
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
```
If the result is less than 100MB, the file will be: `result_0.csv`.
If it is larger than 100MB, the files may be: `result_0.csv, result_1.csv, ...`.
2. Example 2
Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
```
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
```
If the exported file format is PARQUET, `schema` must be specified.
3. Example 3
Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set hdfs high availability information. Use the default column separators and line delimiter.
```
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
```
If the result is less than 1GB, the file will be: `result_0.csv`.
If it is larger than 1GB, the files may be: `result_0.csv, result_1.csv, ...`.
4. Example 4
Export the query results of the UNION statement to the file `bos://bucket/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set hdfs high availability information. PARQUET format does not need to specify the column separator and line delimiter.
```
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
```
5. Example 5
Export simple query results to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV.
A mark file is created after the export finishes.
```
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
```
If the result is less than 1GB, the file will be: `my_file_0.csv`.
If it is larger than 1GB, the files may be: `my_file_0.csv, my_file_1.csv, ...`.
Please note:
1. Paths that do not exist are automatically created.
2. These parameters (access.key/secret.key/endpoint) need to be confirmed with `Tencent Cloud COS`. In particular, the endpoint value does not need to include the bucket_name.
6. Example 6
Use the S3 protocol to export to BOS, with concurrent export enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Example 7
Use the S3 protocol to export to BOS, with the concurrent-export session variable enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
**However, because the query statement has a top-level sorting node, it cannot be exported concurrently even if the concurrent-export session variable is enabled.**
For details, please refer to [OUTFILE Document](../sql-reference/sql-statements/Data%20Manipulation/OUTFILE.md).
## Return result

View File

@ -222,9 +222,11 @@ under the License.
(
"fs.defaultFS" = "",
"hdfs_user"="",
"kerb_principal" = "",
"kerb_ticket_cache_path" = "",
"kerb_token" = ""
"dfs.nameservices"="my_ha",
"dfs.ha.namenodes.xxx"="my_nn1,my_nn2",
"dfs.namenode.rpc-address.xxx.my_nn1"="host1:port",
"dfs.namenode.rpc-address.xxx.my_nn2"="host2:port",
"dfs.client.failover.proxy.provider.xxx"="org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
fs.defaultFS: the defaultFS of the HDFS cluster
hdfs_user: the user name used to connect to the HDFS cluster
A hedged sketch showing how these properties fit together follows.
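For illustration only, here is a hedged sketch of how these properties could be filled in for an HDFS HA cluster; the nameservice name `my_ha`, the user `work`, and the host:port values are placeholders, and the statement that wraps this property clause lies outside the excerpt shown in this diff:
```
(
    "fs.defaultFS" = "hdfs://my_ha",
    "hdfs_user" = "work",
    "dfs.nameservices" = "my_ha",
    "dfs.ha.namenodes.my_ha" = "my_nn1,my_nn2",
    "dfs.namenode.rpc-address.my_ha.my_nn1" = "host1:port",
    "dfs.namenode.rpc-address.my_ha.my_nn2" = "host2:port",
    "dfs.client.failover.proxy.provider.my_ha" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
```
The nameservice name set in `dfs.nameservices` (here `my_ha`) must be repeated in the `dfs.ha.namenodes.*`, `dfs.namenode.rpc-address.*`, and `dfs.client.failover.proxy.provider.*` keys; that is what the `xxx` placeholder in the block above stands for.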

View File

@ -74,6 +74,15 @@ under the License.
For brokers corresponding to different storage systems, the required input parameters differ. For the specific parameters, see the broker required properties in `help broker load`.
When exporting to local, you do not need to fill in this part.
7. hdfs
Specify to use libhdfs to export to HDFS.
Grammar:
WITH HDFS ("key"="value"[,...])
The following parameters can be specified:
fs.defaultFS: the HDFS filesystem to write to, e.g. hdfs://ip:port
hdfs_user: the HDFS user name
A minimal usage sketch is shown below; see also example 8 in the example section.
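The following is a minimal sketch only; the table name, path, and user are placeholders borrowed from the examples below, and the optional PROPERTIES clause is omitted:
```
EXPORT TABLE testTbl
TO "hdfs://hdfs_host:port/a/b/c"
WITH HDFS
(
    "fs.defaultFS" = "hdfs://hdfs_host:port",
    "hdfs_user" = "yyy"
);
```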
## example
1. Export all data from the testTbl table to HDFS
@ -97,5 +106,8 @@ under the License.
7. Export column k1, v1 from the testTbl to the local.
EXPORT TABLE testTbl TO "file:///home/data/a" PROPERTIES ("columns" = "k1,v1");
8. Export all data in the testTbl table to hdfs, using the invisible character "\x07" as the column and row separator.
EXPORT TABLE testTbl TO "hdfs://hdfs_host:port/a/b/c" PROPERTIES ("column_separator"="\\x07", "line_delimiter" = "\\x07") WITH HDFS ("fs.defaultFS"="hdfs://hdfs_host:port", "hdfs_user"="yyy")
## keyword
EXPORT

View File

@ -0,0 +1,207 @@
---
{
"title": "OUTFILE",
"language": "zh-CN"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# OUTFILE
## description
The `SELECT INTO OUTFILE` statement exports query results to a file. It currently supports exporting to remote storage such as HDFS, S3, BOS, and COS (Tencent Cloud) through the Broker process, or directly via the S3 or HDFS protocol.
Grammar:
query_stmt
INTO OUTFILE "file_path"
[format_as]
[properties]
1. file_path
`file_path` specifies the file path and file name prefix, e.g. `hdfs://path/to/my_file_`.
The final file name is assembled from `my_file_`, a file sequence number, and the format suffix. The sequence number starts from 0 and is determined by the number of file splits, for example:
my_file_abcdefg_0.csv
my_file_abcdefg_1.csv
my_file_abcdefg_2.csv
2. format_as
FORMAT AS CSV
Specify the export format. The default is CSV.
3. properties
Specify the relevant properties. Currently, exporting through the Broker process, or through the S3 or HDFS protocol, is supported.
Grammar:
[PROPERTIES ("key"="value", ...)]
The following parameters can be specified:
column_separator: specifies the exported column separator, defaulting to `\t`. Invisible characters such as '\x07' are supported.
line_delimiter: specifies the exported line delimiter, defaulting to `\n`. Invisible characters such as '\x07' are supported.
max_file_size: the maximum size of a single file; if the result exceeds this value, it is split into multiple files.
Broker related attributes need to be prefixed with `broker.`:
broker.name: broker name
broker.hadoop.security.authentication: Specify authentication as kerberos
broker.kerberos_principal: Specify the principal of kerberos
broker.kerberos_keytab: specify the kerberos keytab file path; this is a path on the Broker host.
HDFS protocol properties can be specified directly, prefixed with `hdfs.`:
hdfs.fs.defaultFS: namenode address and port
hdfs.hdfs_user: HDFS user name
S3 protocol properties can be specified directly:
AWS_ENDPOINT
AWS_ACCESS_KEY
AWS_SECRET_KEY
AWS_REGION
## example
1. Export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
If the result is less than 100MB, the file will be: `result_0.csv`.
If it is larger than 100MB, the files may be: `result_0.csv, result_1.csv, ...`.
2. Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
If the exported file format is PARQUET, `schema` must be specified.
3. Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set hdfs high availability information. Use the default column separators and line delimiter.
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
If the result is less than 1GB, the file will be: `result_0.csv`.
If it is larger than 1GB, the files may be: `result_0.csv, result_1.csv, ...`.
4. Export the query results of the UNION statement to the file `bos://bucket/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set hdfs high availability information. PARQUET format does not need to specify the column separator and line delimiter.
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
5. Export simple query results to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV.
A mark file is created after the export finishes.
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
Please note:
1. Paths that do not exist are automatically created.
2. These parameters (access.key/secret.key/endpoint) need to be confirmed with `Tencent Cloud COS`. In particular, the endpoint value does not need to include the bucket_name.
6. Use the S3 protocol to export to BOS, with concurrent export enabled.
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Use the S3 protocol to export to BOS, with the concurrent-export session variable enabled.
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
However, because the query statement has a top-level sorting node, it cannot be exported concurrently even if the concurrent-export session variable is enabled.
8. Use libhdfs to export directly to an HDFS cluster. Export the query results to the file `hdfs://path/to/result.txt`.
Specify the export format as CSV, the user name as 'work', the column separator as ',' and the line delimiter as '\n'.
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"hdfs.fs.defaultFS" = "hdfs://ip:port",
"hdfs.hdfs_user" = "work"
);
If the result is less than 1GB, the file will be: `result_0.csv`.
If it is larger than 1GB, the files may be: `result_0.csv, result_1.csv, ...`.
## keyword
OUTFILE

View File

@ -138,167 +138,8 @@ explain select xxx from xxx where xxx into outfile "s3://xxx" format as csv pro
## Usage example
1. Example 1
For details, see the [OUTFILE document](../sql-reference/sql-statements/Data%20Manipulation/OUTFILE.md).
Export using a broker: export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
```
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
```
If the result is not larger than 100MB, the file will be: `result_0.csv`.
If it is larger than 100MB, the files may be: `result_0.csv, result_1.csv, ...`.
2. Example 2
Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
```
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
```
When exporting query results to a PARQUET file, `schema` must be specified explicitly.
3. Example 3
Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set HDFS high availability information. Use the default column and line separators.
```
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
```
If the result is not larger than 1GB, the file will be: `result_0.csv`.
If it is larger than 1GB, the files may be: `result_0.csv, result_1.csv, ...`.
4. Example 4
Export the query results of the UNION statement to the file `bos://bucket/result.txt`. Specify the export format as PARQUET. Use `my_broker` and set HDFS high availability information. The PARQUET format does not require a column separator.
A mark file is generated after the export finishes.
```
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
```
5. Example 5
Export the query results of the SELECT statement to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV.
A mark file is generated after the export finishes.
```
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
```
If the result is not larger than 1GB, the file will be: `my_file_0.csv`.
If it is larger than 1GB, the files may be: `my_file_0.csv, my_file_1.csv, ...`.
Verify on COS:
1. Paths that do not exist are created automatically.
2. access.key/secret.key/endpoint need to be confirmed with Tencent Cloud COS. In particular, the endpoint value does not need to include the bucket_name.
6. Example 6
Use the S3 protocol to export to BOS, with concurrent export enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Example 7
Use the S3 protocol to export to BOS, with the concurrent-export session variable enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
**However, because the query statement has a top-level sorting node, this query cannot be exported concurrently even if the concurrent-export session variable is enabled.**
## Return result
The export command is synchronous. When the command returns, the operation has finished. A row of results is returned to show the outcome of the export.

View File

@ -221,9 +221,11 @@ under the License.
(
"fs.defaultFS" = "",
"hdfs_user"="",
"kerb_principal" = "",
"kerb_ticket_cache_path" = "",
"kerb_token" = ""
"dfs.nameservices"="my_ha",
"dfs.ha.namenodes.xxx"="my_nn1,my_nn2",
"dfs.namenode.rpc-address.xxx.my_nn1"="host1:port",
"dfs.namenode.rpc-address.xxx.my_nn2"="host2:port",
"dfs.client.failover.proxy.provider.xxx"="org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
fs.defaultFS: the defaultFS of the HDFS cluster
hdfs_user: the user name used to connect to the HDFS cluster
A hedged sketch showing how these properties fit together follows.
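For illustration only, here is a hedged sketch of how these properties could be filled in for an HDFS HA cluster; the nameservice name `my_ha`, the user `work`, and the host:port values are placeholders, and the statement that wraps this property clause lies outside the excerpt shown in this diff:
```
(
    "fs.defaultFS" = "hdfs://my_ha",
    "hdfs_user" = "work",
    "dfs.nameservices" = "my_ha",
    "dfs.ha.namenodes.my_ha" = "my_nn1,my_nn2",
    "dfs.namenode.rpc-address.my_ha.my_nn1" = "host1:port",
    "dfs.namenode.rpc-address.my_ha.my_nn2" = "host2:port",
    "dfs.client.failover.proxy.provider.my_ha" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
```
The nameservice name set in `dfs.nameservices` (here `my_ha`) must be repeated in the `dfs.ha.namenodes.*`, `dfs.namenode.rpc-address.*`, and `dfs.client.failover.proxy.provider.*` keys; that is what the `xxx` placeholder in the block above stands for.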

View File

@ -74,6 +74,15 @@ under the License.
For brokers corresponding to different storage systems, the required input parameters differ. For the specific parameters, see the broker required properties in `help broker load`.
When exporting to local, this part is not required.
7. hdfs
Specify export to HDFS via libhdfs.
Grammar:
WITH HDFS ("key"="value"[,...])
The following parameters can be specified:
fs.defaultFS: the HDFS filesystem to write to, in the form hdfs://ip:port
hdfs_user: the user used to write to HDFS
A minimal usage sketch is shown below; see also example 8 in the example section.
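The following is a minimal sketch only; the table name, path, and user are placeholders borrowed from the examples below, and the optional PROPERTIES clause is omitted:
```
EXPORT TABLE testTbl
TO "hdfs://hdfs_host:port/a/b/c"
WITH HDFS
(
    "fs.defaultFS" = "hdfs://hdfs_host:port",
    "hdfs_user" = "yyy"
);
```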
## example
1. Export all data from the testTbl table to HDFS
@ -97,6 +106,9 @@ under the License.
7. Export columns k1 and v1 of the testTbl table to the local filesystem.
EXPORT TABLE testTbl TO "file:///home/data/a" PROPERTIES ("columns" = "k1,v1");
8. Export all data in the testTbl table to HDFS, using the invisible character "\x07" as the column and line separator.
EXPORT TABLE testTbl TO "hdfs://hdfs_host:port/a/b/c" PROPERTIES ("column_separator"="\\x07", "line_delimiter" = "\\x07") WITH HDFS ("fs.defaultFS"="hdfs://hdfs_host:port", "hdfs_user"="yyy")
## keyword
EXPORT

View File

@ -0,0 +1,209 @@
---
{
"title": "OUTFILE",
"language": "zh-CN"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# OUTFILE
## description
This statement exports query results to a file via the `SELECT INTO OUTFILE` command. It currently supports exporting to remote storage such as HDFS, S3, BOS, and COS (Tencent Cloud) through the Broker process, via the S3 protocol, or directly via the HDFS protocol.
Grammar:
query_stmt
INTO OUTFILE "file_path"
[format_as]
[properties]
1. file_path
file_path specifies the storage path and file name prefix, e.g. `hdfs://path/to/my_file_`.
The final file name is composed of `my_file_`, a file sequence number, and the format suffix. The sequence number starts from 0, and the count equals the number of file splits, for example:
my_file_abcdefg_0.csv
my_file_abcdefg_1.csv
my_file_abcdefg_2.csv
2. format_as
FORMAT AS CSV
Specify the export format. The default is CSV.
3. properties
Specify the relevant properties. Currently, export via the Broker process, or via the S3 or HDFS protocol, is supported.
Grammar:
[PROPERTIES ("key"="value", ...)]
The following properties are supported:
column_separator: column separator
line_delimiter: line delimiter
max_file_size: the size limit of a single file; if the result exceeds this value, it is split into multiple files.
Broker-related properties must be prefixed with `broker.`:
broker.name: broker name
broker.hadoop.security.authentication: specify the authentication method as kerberos
broker.kerberos_principal: specify the kerberos principal
broker.kerberos_keytab: specify the path of the kerberos keytab file. It must be an absolute path to a file on the server where the Broker process runs, and must be accessible to the Broker process.
HDFS-related properties must be prefixed with `hdfs.`:
hdfs.fs.defaultFS: namenode address and port
hdfs.hdfs_user: HDFS user name
For the S3 protocol, the S3 configuration can be specified directly:
AWS_ENDPOINT
AWS_ACCESS_KEY
AWS_SECRET_KEY
AWS_REGION
## example
1. Export using a broker: export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
If the result is not larger than 100MB, the file will be: `result_0.csv`.
If it is larger than 100MB, the files may be: `result_0.csv, result_1.csv, ...`.
2. Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
When exporting query results to a PARQUET file, `schema` must be specified explicitly.
3. Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set HDFS high availability information. Use the default column and line separators.
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
If the result is not larger than 1GB, the file will be: `result_0.csv`.
If it is larger than 1GB, the files may be: `result_0.csv, result_1.csv, ...`.
4. Export the query results of the UNION statement to the file `bos://bucket/result.txt`. Specify the export format as PARQUET. Use `my_broker` and set HDFS high availability information. The PARQUET format does not require a column separator.
A mark file is generated after the export finishes.
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
5. Export the query results of the SELECT statement to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV.
A mark file is generated after the export finishes.
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
If the result is not larger than 1GB, the file will be: `my_file_0.csv`.
If it is larger than 1GB, the files may be: `my_file_0.csv, my_file_1.csv, ...`.
Verify on COS:
1. Paths that do not exist are created automatically.
2. access.key/secret.key/endpoint need to be confirmed with Tencent Cloud COS. In particular, the endpoint value does not need to include the bucket_name.
6. Use the S3 protocol to export to BOS, with concurrent export enabled.
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Use the S3 protocol to export to BOS, with the concurrent-export session variable enabled.
Note: because the query statement has a top-level sorting node, this query cannot be exported concurrently even if the concurrent-export session variable is enabled.
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
8. Export via the HDFS protocol: export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV and the user name as work. Specify the column separator as `,` and the line delimiter as `\n`.
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"hdfs.fs.defaultFS" = "hdfs://ip:port",
"hdfs.hdfs_user" = "work"
);
If the result is not larger than 100MB, the file will be: `result_0.csv`.
If it is larger than 100MB, the files may be: `result_0.csv, result_1.csv, ...`.
## keyword
OUTFILE