[Fix](Outfile) Fix the column type mapping in the orc/parquet file format (#32281)

| Doris Type             | Orc Type                     |  Parquet Type                |
|---------------------|--------------------|------------------------|
| Date                            | Long (logical: DATE)                 |       int32 (logical: DATE)                                        |
| DateTime                    | TIMESTAMP (logical: TIMESTAMP)    |       int96                          |
This commit is contained in:
Tiewei Fang
2024-03-21 14:59:21 +08:00
committed by yiguolei
parent ca05129fa1
commit d7a3ff1ddf
54 changed files with 743 additions and 1026 deletions

View File

@@ -297,15 +297,21 @@ public class OutFileClause {
}
orcType = "string";
break;
case DATEV2:
orcType = "date";
break;
case DATETIMEV2:
orcType = "timestamp";
break;
case CHAR:
orcType = "char(" + dorisType.getLength() + ")";
break;
case VARCHAR:
orcType = "varchar(" + dorisType.getLength() + ")";
break;
case LARGEINT:
case DATE:
case DATETIME:
case DATEV2:
case CHAR:
case VARCHAR:
orcType = "string";
break;
case DECIMALV2:
@@ -402,74 +408,50 @@ public class OutFileClause {
case FLOAT:
case DOUBLE:
case STRING:
if (!schema.second.equals(resultType.getPrimitiveType().toString().toLowerCase())) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use " + resultType.getPrimitiveType().toString() + ","
+ " but the type of column " + i + " is " + schema.second);
}
checkOrcType(schema.second, resultType.getPrimitiveType().toString().toLowerCase(), true,
resultType.getPrimitiveType().toString());
break;
case DATEV2:
checkOrcType(schema.second, "date", true, resultType.getPrimitiveType().toString());
break;
case DATETIMEV2:
if (!schema.second.equals("timestamp")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use timestamp, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "timestamp", true, resultType.getPrimitiveType().toString());
break;
case CHAR:
checkOrcType(schema.second, "char", false, resultType.getPrimitiveType().toString());
break;
case VARCHAR:
checkOrcType(schema.second, "varchar", false, resultType.getPrimitiveType().toString());
break;
case LARGEINT:
case DATE:
case DATETIME:
case DATEV2:
case CHAR:
case VARCHAR:
if (!schema.second.equals("string")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use string, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "string", true, resultType.getPrimitiveType().toString());
break;
case DECIMAL32:
case DECIMAL64:
case DECIMAL128:
case DECIMALV2:
if (!schema.second.startsWith("decimal")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use string, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "decimal", false, resultType.getPrimitiveType().toString());
break;
case HLL:
case BITMAP:
if (ConnectContext.get() != null && ConnectContext.get()
.getSessionVariable().isReturnObjectDataAsBinary()) {
if (!schema.second.equals("string")) {
throw new AnalysisException("project field type is HLL/BITMAP, should use string, "
+ "but the definition type of column " + i + " is " + schema.second);
}
checkOrcType(schema.second, "string", true, resultType.getPrimitiveType().toString());
} else {
throw new AnalysisException("Orc format does not support column type: "
+ resultType.getPrimitiveType());
}
break;
case STRUCT:
if (!schema.second.startsWith("struct")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use struct, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "struct", false, resultType.getPrimitiveType().toString());
break;
case MAP:
if (!schema.second.startsWith("map")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use map, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "map", false, resultType.getPrimitiveType().toString());
break;
case ARRAY:
if (!schema.second.startsWith("array")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use array, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "array", false, resultType.getPrimitiveType().toString());
break;
default:
throw new AnalysisException("Orc format does not support column type: "
@@ -478,6 +460,22 @@ public class OutFileClause {
}
}
/**
 * Validates a user-declared ORC column type against the type expected for the Doris column.
 *
 * @param orcType    the ORC type string declared in the out-file schema
 * @param expectType the ORC type the Doris column maps to
 * @param isEqual    true to require an exact match; false to only require the declared
 *                   type to begin with {@code expectType} (e.g. parameterized types such
 *                   as "char(10)", "decimal(27,9)", "array&lt;...&gt;")
 * @param dorisType  the Doris type name, used only for the error message
 * @throws AnalysisException if the declared ORC type does not match the expected one
 */
private void checkOrcType(String orcType, String expectType, boolean isEqual, String dorisType)
        throws AnalysisException {
    // Exact-match types (e.g. "string", "timestamp") use equals; prefix-match
    // types (parameterized like "varchar(...)" or nested like "map<...>") use startsWith.
    boolean matched = isEqual ? orcType.equals(expectType) : orcType.startsWith(expectType);
    if (!matched) {
        throw new AnalysisException("project field type is " + dorisType
                + ", should use " + expectType + ", but the definition type is " + orcType);
    }
}
private void analyzeForParquetFormat(List<Expr> resultExprs, List<String> colLabels) throws AnalysisException {
if (this.parquetSchemas.isEmpty()) {
genParquetColumnName(resultExprs, colLabels);