[Fix](Outfile) Fix the column type mapping in the orc/parquet file format (#32281)

| Doris Type             | Orc Type                     |  Parquet Type                |
|---------------------|--------------------|------------------------|
| Date                            | Long (logical: DATE)                 |       int32 (logical: DATE)                                        |
| DateTime                    | TIMESTAMP (logical: TIMESTAMP)    |       int96                          |
This commit is contained in:
Tiewei Fang
2024-03-21 14:59:21 +08:00
committed by yiguolei
parent ca05129fa1
commit d7a3ff1ddf
54 changed files with 743 additions and 1026 deletions

View File

@@ -297,15 +297,21 @@ public class OutFileClause {
}
orcType = "string";
break;
case DATEV2:
orcType = "date";
break;
case DATETIMEV2:
orcType = "timestamp";
break;
case CHAR:
orcType = "char(" + dorisType.getLength() + ")";
break;
case VARCHAR:
orcType = "varchar(" + dorisType.getLength() + ")";
break;
case LARGEINT:
case DATE:
case DATETIME:
case DATEV2:
case CHAR:
case VARCHAR:
orcType = "string";
break;
case DECIMALV2:
@@ -402,74 +408,50 @@ public class OutFileClause {
case FLOAT:
case DOUBLE:
case STRING:
if (!schema.second.equals(resultType.getPrimitiveType().toString().toLowerCase())) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use " + resultType.getPrimitiveType().toString() + ","
+ " but the type of column " + i + " is " + schema.second);
}
checkOrcType(schema.second, resultType.getPrimitiveType().toString().toLowerCase(), true,
resultType.getPrimitiveType().toString());
break;
case DATEV2:
checkOrcType(schema.second, "date", true, resultType.getPrimitiveType().toString());
break;
case DATETIMEV2:
if (!schema.second.equals("timestamp")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use timestamp, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "timestamp", true, resultType.getPrimitiveType().toString());
break;
case CHAR:
checkOrcType(schema.second, "char", false, resultType.getPrimitiveType().toString());
break;
case VARCHAR:
checkOrcType(schema.second, "varchar", false, resultType.getPrimitiveType().toString());
break;
case LARGEINT:
case DATE:
case DATETIME:
case DATEV2:
case CHAR:
case VARCHAR:
if (!schema.second.equals("string")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use string, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "string", true, resultType.getPrimitiveType().toString());
break;
case DECIMAL32:
case DECIMAL64:
case DECIMAL128:
case DECIMALV2:
if (!schema.second.startsWith("decimal")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use string, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "decimal", false, resultType.getPrimitiveType().toString());
break;
case HLL:
case BITMAP:
if (ConnectContext.get() != null && ConnectContext.get()
.getSessionVariable().isReturnObjectDataAsBinary()) {
if (!schema.second.equals("string")) {
throw new AnalysisException("project field type is HLL/BITMAP, should use string, "
+ "but the definition type of column " + i + " is " + schema.second);
}
checkOrcType(schema.second, "string", true, resultType.getPrimitiveType().toString());
} else {
throw new AnalysisException("Orc format does not support column type: "
+ resultType.getPrimitiveType());
}
break;
case STRUCT:
if (!schema.second.startsWith("struct")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use struct, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "struct", false, resultType.getPrimitiveType().toString());
break;
case MAP:
if (!schema.second.startsWith("map")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use map, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "map", false, resultType.getPrimitiveType().toString());
break;
case ARRAY:
if (!schema.second.startsWith("array")) {
throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
+ ", should use array, but the definition type of column " + i + " is "
+ schema.second);
}
checkOrcType(schema.second, "array", false, resultType.getPrimitiveType().toString());
break;
default:
throw new AnalysisException("Orc format does not support column type: "
@@ -478,6 +460,22 @@ public class OutFileClause {
}
}
/**
 * Validates a user-declared ORC column type against the type expected for the Doris column.
 *
 * @param orcType    the ORC type string declared in the out-file schema
 * @param expectType the ORC type the Doris column maps to
 * @param isEqual    true to require an exact match; false to only require the declared
 *                   type to begin with {@code expectType} (e.g. parameterized types such
 *                   as "char(10)", "decimal(27,9)", "array&lt;...&gt;")
 * @param dorisType  the Doris type name, used only for the error message
 * @throws AnalysisException if the declared ORC type does not match the expected one
 */
private void checkOrcType(String orcType, String expectType, boolean isEqual, String dorisType)
        throws AnalysisException {
    // Exact-match types (e.g. "string", "timestamp") use equals; prefix-match
    // types (parameterized like "varchar(...)" or nested like "map<...>") use startsWith.
    boolean matched = isEqual ? orcType.equals(expectType) : orcType.startsWith(expectType);
    if (!matched) {
        throw new AnalysisException("project field type is " + dorisType
                + ", should use " + expectType + ", but the definition type is " + orcType);
    }
}
private void analyzeForParquetFormat(List<Expr> resultExprs, List<String> colLabels) throws AnalysisException {
if (this.parquetSchemas.isEmpty()) {
genParquetColumnName(resultExprs, colLabels);