[feature](stream load) (step one) Add arrow data type for stream load (#26709)

By using the Arrow data format, we can reduce the amount of data transferred during stream load and improve data import performance.
Author: wuwenchi
Date: 2023-12-06 23:29:46 +08:00 (committed by GitHub)
Commit: 54d062ddee
Parent: 2ca66ff61c
15 changed files with 452 additions and 13 deletions
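The description above is the motivation; the hunks below wire a new "arrow" format value through the FE. As a sketch of what a client would do with it, here is a minimal example that builds one Arrow record batch, serializes it as an Arrow IPC stream, and PUTs it to the Stream Load endpoint. This is an illustration, not code from this commit: it assumes the Apache Arrow Java library on the classpath (arrow-vector plus an allocator backend such as arrow-memory-netty); the host, port, database, table, label, and credentials are placeholders; and passing format: arrow as a header mirrors how csv/json formats are selected for Stream Load.

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.channels.Channels;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.List;

public class ArrowStreamLoadSketch {
    public static void main(String[] args) throws Exception {
        // Build one Arrow record batch in memory.
        Schema schema = new Schema(List.of(
                Field.nullable("id", new ArrowType.Int(32, true)),
                Field.nullable("name", new ArrowType.Utf8())));
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        try (RootAllocator allocator = new RootAllocator();
             VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
            IntVector id = (IntVector) root.getVector("id");
            VarCharVector name = (VarCharVector) root.getVector("name");
            id.setSafe(0, 1);
            name.setSafe(0, "alice".getBytes(StandardCharsets.UTF_8));
            root.setRowCount(1);
            // Serialize as an Arrow IPC stream: this becomes the request payload.
            try (ArrowStreamWriter writer =
                         new ArrowStreamWriter(root, null, Channels.newChannel(body))) {
                writer.start();
                writer.writeBatch();
                writer.end();
            }
        }

        // PUT the payload to the Stream Load endpoint with format=arrow.
        // Host, port, db, table, label, and credentials are placeholders.
        String auth = Base64.getEncoder()
                .encodeToString("root:".getBytes(StandardCharsets.UTF_8));
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://127.0.0.1:8030/api/db1/tbl1/_stream_load"))
                .header("Authorization", "Basic " + auth)
                .header("format", "arrow")
                .header("label", "arrow_load_example_1")
                .PUT(HttpRequest.BodyPublishers.ofByteArray(body.toByteArray()))
                .build();
        // Follow redirects: the FE 307-redirects Stream Load requests to a BE.
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.ALWAYS)
                .build();
        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body());
    }
}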


@@ -357,6 +357,9 @@ public class DataDescription implements InsertStmt.DataDesc {
             case FORMAT_WAL:
                 this.fileFormat = "wal";
                 break;
+            case FORMAT_ARROW:
+                this.fileFormat = "arrow";
+                break;
             default:
                 this.fileFormat = "unknown";
                 break;
@@ -1124,6 +1127,7 @@ public class DataDescription implements InsertStmt.DataDesc {
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_ORC)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_JSON)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_WAL)
+                && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_ARROW)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_HIVE_TEXT)) {
             throw new AnalysisException("File Format Type " + fileFormat + " is invalid.");
         }
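With the new constant in place (next hunk), this whitelist lets "arrow" through analysis. For illustration only, the same check expressed as a set lookup; only the formats visible in this hunk are listed (the real chain has more branches above the hunk boundary), and the real code chains equalsIgnoreCase against FileFormatConstants:

import java.util.Locale;
import java.util.Set;

public class FormatCheckSketch {
    // Formats visible in this hunk; not the complete whitelist.
    static final Set<String> FORMATS =
            Set.of("orc", "json", "wal", "arrow", "hive_text");

    static void check(String fileFormat) {
        if (!FORMATS.contains(fileFormat.toLowerCase(Locale.ROOT))) {
            throw new IllegalArgumentException(
                    "File Format Type " + fileFormat + " is invalid.");
        }
    }

    public static void main(String[] args) {
        check("ARROW"); // passes after this commit
    }
}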


@@ -33,6 +33,7 @@ public class FileFormatConstants {
     public static final String FORMAT_JSON = "json";
     public static final String FORMAT_AVRO = "avro";
     public static final String FORMAT_WAL = "wal";
+    public static final String FORMAT_ARROW = "arrow";
     public static final String PROP_FORMAT = "format";
     public static final String PROP_COLUMN_SEPARATOR = "column_separator";


@@ -564,8 +564,10 @@ public class Util {
                 // TODO: Add TEXTFILE to TFileFormatType to Support hive text file format.
                 || lowerFileFormat.equals(FileFormatConstants.FORMAT_HIVE_TEXT)) {
             return TFileFormatType.FORMAT_CSV_PLAIN;
-        } else if (lowerFileFormat.equals("wal")) {
+        } else if (lowerFileFormat.equals(FileFormatConstants.FORMAT_WAL)) {
             return TFileFormatType.FORMAT_WAL;
+        } else if (lowerFileFormat.equals(FileFormatConstants.FORMAT_ARROW)) {
+            return TFileFormatType.FORMAT_ARROW;
         } else {
             return TFileFormatType.FORMAT_UNKNOWN;
         }
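Besides adding the arrow branch, this hunk replaces the hard-coded "wal" literal with its FileFormatConstants constant. As a compact, self-contained illustration of the same string-to-enum dispatch; the enum here is only a stand-in, since the real TFileFormatType is Thrift-generated:

public class FormatDispatchSketch {
    // Stand-in for the Thrift-generated TFileFormatType enum.
    enum TFileFormatType { FORMAT_CSV_PLAIN, FORMAT_WAL, FORMAT_ARROW, FORMAT_UNKNOWN }

    static TFileFormatType fromString(String fileFormat) {
        switch (fileFormat.toLowerCase()) {
            case "csv":
            case "hive_text":
                return TFileFormatType.FORMAT_CSV_PLAIN;
            case "wal":
                return TFileFormatType.FORMAT_WAL;
            case "arrow":
                return TFileFormatType.FORMAT_ARROW;
            default:
                return TFileFormatType.FORMAT_UNKNOWN;
        }
    }

    public static void main(String[] args) {
        System.out.println(fromString("Arrow")); // FORMAT_ARROW
    }
}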


@@ -344,10 +344,12 @@ public class Load {
         for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
             columnExprMap.put(importColumnDesc.getColumnName(), importColumnDesc.getExpr());
         }
+        HashMap<String, Type> colToType = new HashMap<>();
         // check default value and auto-increment column
         for (Column column : tbl.getBaseSchema()) {
             String columnName = column.getName();
+            colToType.put(columnName, column.getType());
             if (columnExprMap.containsKey(columnName)) {
                 continue;
             }
@@ -427,9 +429,15 @@ public class Load {
                 exprsByName.put(realColName, expr);
             } else {
                 SlotDescriptor slotDesc = analyzer.getDescTbl().addSlotDescriptor(srcTupleDesc);
-                // columns default to varchar type
-                slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
-                slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
+                if (formatType == TFileFormatType.FORMAT_ARROW) {
+                    slotDesc.setColumn(new Column(realColName, colToType.get(realColName)));
+                } else {
+                    // columns default to varchar type
+                    slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
+                    slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
+                }
                 // ISSUE A: the src slot should be nullable even if the column is not nullable,
                 // because the src slot holds what we read from the file, not the real column value.
                 // If the column is not nullable, an error will be thrown when filling the dest slot,
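The hunk above is the planner-side payoff: when formatType is FORMAT_ARROW, the source slot keeps the table column's real type (looked up in the colToType map built earlier) instead of the VARCHAR default used for text formats, so no string-to-type cast is needed. A rough, self-contained illustration of the difference, assuming a hypothetical INT column id and the Arrow Java library:

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class TypedSlotSketch {
    public static void main(String[] args) {
        // CSV/JSON path: every field is read as text, so src slots default
        // to VARCHAR and the plan inserts a cast to the destination type.
        int fromCsv = Integer.parseInt("42");

        // Arrow path: the vector is already typed, so the src slot can be
        // declared with the real column type and the cast disappears.
        try (RootAllocator allocator = new RootAllocator();
             IntVector id = new IntVector("id", allocator)) {
            id.allocateNew(1);
            id.setSafe(0, 42);
            id.setValueCount(1);
            System.out.println(fromCsv == id.get(0)); // true, no parsing needed
        }
    }
}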