[feature](stream load) (step one) Add arrow data type for stream load (#26709)

By using the Arrow data format, we can reduce the amount of data transferred during stream load and improve data import performance.
Author: wuwenchi
Date: 2023-12-06 23:29:46 +08:00 (committed by GitHub)
Commit: 54d062ddee
Parent: 2ca66ff61c
15 changed files with 452 additions and 13 deletions
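The description above is the motivation; the hunks below wire a new "arrow" format value through the FE. As a sketch of what a client would do with it, here is a minimal example that builds one Arrow record batch, serializes it as an Arrow IPC stream, and PUTs it to the Stream Load endpoint. This is an illustration, not code from this commit: it assumes the Apache Arrow Java library on the classpath (arrow-vector plus an allocator backend such as arrow-memory-netty); the host, port, database, table, label, and credentials are placeholders; and passing format: arrow as a header mirrors how csv/json formats are selected for Stream Load.

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.channels.Channels;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.List;

public class ArrowStreamLoadSketch {
    public static void main(String[] args) throws Exception {
        // Build one Arrow record batch in memory.
        Schema schema = new Schema(List.of(
                Field.nullable("id", new ArrowType.Int(32, true)),
                Field.nullable("name", new ArrowType.Utf8())));
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        try (RootAllocator allocator = new RootAllocator();
             VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
            IntVector id = (IntVector) root.getVector("id");
            VarCharVector name = (VarCharVector) root.getVector("name");
            id.setSafe(0, 1);
            name.setSafe(0, "alice".getBytes(StandardCharsets.UTF_8));
            root.setRowCount(1);
            // Serialize as an Arrow IPC stream: this becomes the request payload.
            try (ArrowStreamWriter writer =
                         new ArrowStreamWriter(root, null, Channels.newChannel(body))) {
                writer.start();
                writer.writeBatch();
                writer.end();
            }
        }

        // PUT the payload to the Stream Load endpoint with format=arrow.
        // Host, port, db, table, label, and credentials are placeholders.
        String auth = Base64.getEncoder()
                .encodeToString("root:".getBytes(StandardCharsets.UTF_8));
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://127.0.0.1:8030/api/db1/tbl1/_stream_load"))
                .header("Authorization", "Basic " + auth)
                .header("format", "arrow")
                .header("label", "arrow_load_example_1")
                .PUT(HttpRequest.BodyPublishers.ofByteArray(body.toByteArray()))
                .build();
        // Follow redirects: the FE 307-redirects Stream Load requests to a BE.
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.ALWAYS)
                .build();
        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body());
    }
}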


@@ -357,6 +357,9 @@ public class DataDescription implements InsertStmt.DataDesc {
             case FORMAT_WAL:
                 this.fileFormat = "wal";
                 break;
+            case FORMAT_ARROW:
+                this.fileFormat = "arrow";
+                break;
             default:
                 this.fileFormat = "unknown";
                 break;
@@ -1124,6 +1127,7 @@ public class DataDescription implements InsertStmt.DataDesc {
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_ORC)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_JSON)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_WAL)
+                && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_ARROW)
                 && !fileFormat.equalsIgnoreCase(FileFormatConstants.FORMAT_HIVE_TEXT)) {
             throw new AnalysisException("File Format Type " + fileFormat + " is invalid.");
         }
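With the new constant in place (next hunk), this whitelist lets "arrow" through analysis. For illustration only, the same check expressed as a set lookup; only the formats visible in this hunk are listed (the real chain has more branches above the hunk boundary), and the real code chains equalsIgnoreCase against FileFormatConstants:

import java.util.Locale;
import java.util.Set;

public class FormatCheckSketch {
    // Formats visible in this hunk; not the complete whitelist.
    static final Set<String> FORMATS =
            Set.of("orc", "json", "wal", "arrow", "hive_text");

    static void check(String fileFormat) {
        if (!FORMATS.contains(fileFormat.toLowerCase(Locale.ROOT))) {
            throw new IllegalArgumentException(
                    "File Format Type " + fileFormat + " is invalid.");
        }
    }

    public static void main(String[] args) {
        check("ARROW"); // passes after this commit
    }
}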


@@ -33,6 +33,7 @@ public class FileFormatConstants {
     public static final String FORMAT_JSON = "json";
     public static final String FORMAT_AVRO = "avro";
     public static final String FORMAT_WAL = "wal";
+    public static final String FORMAT_ARROW = "arrow";
     public static final String PROP_FORMAT = "format";
     public static final String PROP_COLUMN_SEPARATOR = "column_separator";


@@ -564,8 +564,10 @@ public class Util {
                 // TODO: Add TEXTFILE to TFileFormatType to Support hive text file format.
                 || lowerFileFormat.equals(FileFormatConstants.FORMAT_HIVE_TEXT)) {
             return TFileFormatType.FORMAT_CSV_PLAIN;
-        } else if (lowerFileFormat.equals("wal")) {
+        } else if (lowerFileFormat.equals(FileFormatConstants.FORMAT_WAL)) {
             return TFileFormatType.FORMAT_WAL;
+        } else if (lowerFileFormat.equals(FileFormatConstants.FORMAT_ARROW)) {
+            return TFileFormatType.FORMAT_ARROW;
         } else {
             return TFileFormatType.FORMAT_UNKNOWN;
         }
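Besides adding the arrow branch, this hunk replaces the hard-coded "wal" literal with its FileFormatConstants constant. As a compact, self-contained illustration of the same string-to-enum dispatch; the enum here is only a stand-in, since the real TFileFormatType is Thrift-generated:

public class FormatDispatchSketch {
    // Stand-in for the Thrift-generated TFileFormatType enum.
    enum TFileFormatType { FORMAT_CSV_PLAIN, FORMAT_WAL, FORMAT_ARROW, FORMAT_UNKNOWN }

    static TFileFormatType fromString(String fileFormat) {
        switch (fileFormat.toLowerCase()) {
            case "csv":
            case "hive_text":
                return TFileFormatType.FORMAT_CSV_PLAIN;
            case "wal":
                return TFileFormatType.FORMAT_WAL;
            case "arrow":
                return TFileFormatType.FORMAT_ARROW;
            default:
                return TFileFormatType.FORMAT_UNKNOWN;
        }
    }

    public static void main(String[] args) {
        System.out.println(fromString("Arrow")); // FORMAT_ARROW
    }
}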


@@ -344,10 +344,12 @@ public class Load {
         for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
             columnExprMap.put(importColumnDesc.getColumnName(), importColumnDesc.getExpr());
         }
+        HashMap<String, Type> colToType = new HashMap<>();
         // check default value and auto-increment column
         for (Column column : tbl.getBaseSchema()) {
             String columnName = column.getName();
+            colToType.put(columnName, column.getType());
             if (columnExprMap.containsKey(columnName)) {
                 continue;
             }
@@ -427,9 +429,15 @@ public class Load {
                 exprsByName.put(realColName, expr);
             } else {
                 SlotDescriptor slotDesc = analyzer.getDescTbl().addSlotDescriptor(srcTupleDesc);
-                // columns default to varchar type
-                slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
-                slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
+                if (formatType == TFileFormatType.FORMAT_ARROW) {
+                    slotDesc.setColumn(new Column(realColName, colToType.get(realColName)));
+                } else {
+                    // columns default to varchar type
+                    slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
+                    slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
+                }
                 // ISSUE A: the src slot should be nullable even if the column is not nullable,
                 // because the src slot holds what we read from the file, not the real column value.
                 // If the column is not nullable, an error will be thrown when filling the dest slot,
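The hunk above is the planner-side payoff: when formatType is FORMAT_ARROW, the source slot keeps the table column's real type (looked up in the colToType map built earlier) instead of the VARCHAR default used for text formats, so no string-to-type cast is needed. A rough, self-contained illustration of the difference, assuming a hypothetical INT column id and the Arrow Java library:

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class TypedSlotSketch {
    public static void main(String[] args) {
        // CSV/JSON path: every field is read as text, so src slots default
        // to VARCHAR and the plan inserts a cast to the destination type.
        int fromCsv = Integer.parseInt("42");

        // Arrow path: the vector is already typed, so the src slot can be
        // declared with the real column type and the cast disappears.
        try (RootAllocator allocator = new RootAllocator();
             IntVector id = new IntVector("id", allocator)) {
            id.allocateNew(1);
            id.setSafe(0, 42);
            id.setValueCount(1);
            System.out.println(fromCsv == id.get(0)); // true, no parsing needed
        }
    }
}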