[fix](csv-reader) fix bug that csv reader cannot read text-format hms table (#13515)

1. Add the missing field and line delimiters when reading text (CSV) format files.
2. When querying an external table in text (CSV) format, pass the column position mapping to the BE;
    otherwise the column order is wrong (see the sketch below).
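
To make fix 2 concrete, here is a minimal standalone sketch of the idea (all class and variable names below are illustrative, not actual Doris APIs): a text-format row stores its fields in base-schema order, so each materialized slot must be mapped back to its schema position before the BE can pick the right field out of a parsed line.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Standalone demo of why the column position mapping is needed; names are
// illustrative, not Doris APIs. A text-format row stores fields in
// base-schema order, while the query materializes slots in SELECT order.
public class ColumnPositionMappingDemo {
    public static void main(String[] args) {
        List<String> baseSchema = Arrays.asList("id", "name", "age", "city");
        // Slots requested by the query, in SELECT order: SELECT city, id FROM t
        List<String> materializedSlots = Arrays.asList("city", "id");

        // Build the position map that the FE now sends to the BE.
        List<Integer> columnIdxs = new ArrayList<>();
        for (String slot : materializedSlots) {
            int idx = baseSchema.indexOf(slot); // the real code matches case-insensitively
            if (idx == -1) {
                throw new IllegalStateException("Column " + slot + " not found");
            }
            columnIdxs.add(idx);
        }

        // The BE picks fields out of each parsed line via the map.
        String[] parsedLine = {"1", "alice", "30", "paris"};
        for (int i = 0; i < materializedSlots.size(); i++) {
            System.out.println(materializedSlots.get(i) + " = " + parsedLine[columnIdxs.get(i)]);
        }
        // Prints "city = paris" and "id = 1". Without the map the BE would
        // read positionally and wrongly return "city = 1", "id = alice".
    }
}
```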

TODO:
1. For now, querying a CSV file with a non-existent column returns null,
    but it should return the column's default value (falling back to null when no default is defined).
2. Add regression tests once the Hive docker environment is ready.
Mingyu Chen
2022-10-22 22:40:03 +08:00
committed by GitHub
parent a7c221d04e
commit 3a3def447d
11 changed files with 147 additions and 57 deletions


@@ -810,6 +810,8 @@ public class HiveMetaStoreClientHelper {
return Type.FLOAT;
case "double":
return Type.DOUBLE;
case "string":
return Type.STRING;
default:
break;
}
@@ -923,3 +925,4 @@ public class HiveMetaStoreClientHelper {
return output.toString();
}
}
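
As a side note, the two added `case` lines above close a gap in the primitive-type mapping: a Hive `string` column previously fell through to the `default` branch and was left unresolved. A minimal stand-in for that switch (the method name and string return values here are illustrative; the real helper in HiveMetaStoreClientHelper returns Doris `Type` values):

```java
// Illustrative stand-in for the switch in the hunk above; the real code
// returns org.apache.doris.catalog.Type values rather than strings.
public class HiveTypeMappingSketch {
    static String mapHivePrimitiveType(String hiveType) {
        switch (hiveType.toLowerCase()) {
            case "float":
                return "FLOAT";
            case "double":
                return "DOUBLE";
            case "string": // newly handled by this commit
                return "STRING";
            default:
                return "UNSUPPORTED"; // the real code breaks out and resolves elsewhere
        }
    }

    public static void main(String[] args) {
        System.out.println(mapHivePrimitiveType("string")); // STRING instead of UNSUPPORTED
    }
}
```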


@@ -73,6 +73,17 @@ public interface TableIf {
Column getColumn(String name);
default int getBaseColumnIdxByName(String colName) {
int i = 0;
for (Column col : getBaseSchema()) {
if (col.getName().equalsIgnoreCase(colName)) {
return i;
}
++i;
}
return -1;
}
String getMysqlType();
String getEngine();
@@ -163,3 +174,4 @@ public interface TableIf {
}
}
}
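
A quick standalone illustration of the new getBaseColumnIdxByName default method's semantics (plain strings stand in for the Column objects returned by getBaseSchema()): the match is case-insensitive, and a miss returns -1, which the scan node further down turns into a UserException.

```java
import java.util.Arrays;
import java.util.List;

// Standalone sketch of getBaseColumnIdxByName's semantics; plain strings
// stand in for the real Column objects returned by getBaseSchema().
public class BaseColumnIdxDemo {
    static int getBaseColumnIdxByName(List<String> baseSchema, String colName) {
        int i = 0;
        for (String col : baseSchema) {
            if (col.equalsIgnoreCase(colName)) {
                return i;
            }
            ++i;
        }
        return -1; // the caller decides how to handle a missing column
    }

    public static void main(String[] args) {
        List<String> schema = Arrays.asList("id", "Name", "age");
        System.out.println(getBaseColumnIdxByName(schema, "NAME")); // 1 (case-insensitive)
        System.out.println(getBaseColumnIdxByName(schema, "city")); // -1 (not found)
    }
}
```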


@@ -311,3 +311,4 @@ public class HMSExternalTable extends ExternalTable {
return catalog.getCatalogProperty().getS3Properties();
}
}


@@ -70,6 +70,7 @@ public class HMSExternalCatalog extends ExternalCatalog {
client = new HiveMetaStoreClient(hiveConf);
} catch (MetaException e) {
LOG.warn("Failed to create HiveMetaStoreClient: {}", e.getMessage());
return;
}
List<String> allDatabases;
try {


@@ -278,6 +278,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
ParamCreateContext context = contexts.get(i);
FileScanProviderIf scanProvider = scanProviders.get(i);
setDefaultValueExprs(scanProvider, context);
setColumnPositionMappingForTextFile(scanProvider, context);
finalizeParamsForLoad(context, analyzer);
createScanRangeLocations(context, scanProvider);
this.inputSplitsNum += scanProvider.getInputSplitNum();
@@ -285,6 +286,27 @@ public class ExternalFileScanNode extends ExternalScanNode {
}
}
private void setColumnPositionMappingForTextFile(FileScanProviderIf scanProvider, ParamCreateContext context)
throws UserException {
if (type != Type.QUERY) {
return;
}
TableIf tbl = scanProvider.getTargetTable();
List<Integer> columnIdxs = Lists.newArrayList();
for (SlotDescriptor slot : desc.getSlots()) {
if (!slot.isMaterialized()) {
continue;
}
String colName = slot.getColumn().getName();
int idx = tbl.getBaseColumnIdxByName(colName);
if (idx == -1) {
throw new UserException("Column " + colName + " not found in table " + tbl.getName());
}
columnIdxs.add(idx);
}
context.params.setColumnIdxs(columnIdxs);
}
protected void setDefaultValueExprs(FileScanProviderIf scanProvider, ParamCreateContext context)
throws UserException {
TableIf tbl = scanProvider.getTargetTable();
@@ -320,7 +342,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
default:
Preconditions.checkState(false, type);
}
// if slot desc is null, which means it is an unrelated slot, just skip.
// eg:
// (a, b, c) set (x=a, y=b, z=c)
// c does not exist in the file, so z will be filled with null, even if z has a default value.
@@ -499,5 +521,3 @@ public class ExternalFileScanNode extends ExternalScanNode {
}
}


@@ -35,11 +35,13 @@ import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.planner.external.ExternalFileScanNode.ParamCreateContext;
import org.apache.doris.system.Backend;
import org.apache.doris.thrift.TExternalScanRange;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanRange;
import org.apache.doris.thrift.TFileScanRangeParams;
import org.apache.doris.thrift.TFileScanSlotInfo;
import org.apache.doris.thrift.TFileTextScanRangeParams;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.THdfsParams;
import org.apache.doris.thrift.TNetworkAddress;
@@ -76,6 +78,10 @@ import java.util.stream.Collectors;
public class HiveScanProvider implements HMSTableScanProviderIf {
private static final Logger LOG = LogManager.getLogger(HiveScanProvider.class);
private static final String PROP_FIELD_DELIMITER = "field.delim";
private static final String DEFAULT_FIELD_DELIMITER = "|";
private static final String DEFAULT_LINE_DELIMITER = "\n";
protected HMSExternalTable hmsTable;
protected int inputSplitNum = 0;
@@ -268,7 +274,20 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
String fsName = fullPath.replace(filePath, "");
TFileType locationType = getLocationType();
context.params.setFileType(locationType);
TFileFormatType fileFormatType = getFileFormatType();
context.params.setFormatType(fileFormatType);
if (fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
String columnSeparator
= hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters()
.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER);
textParams.setColumnSeparator(columnSeparator);
textParams.setLineDelimiter(DEFAULT_LINE_DELIMITER);
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
context.params.setFileAttributes(fileAttributes);
}
// set hdfs params for hdfs file type.
Map<String, String> locationProperties = getLocationProperties();
if (locationType == TFileType.FILE_HDFS) {
@@ -364,3 +383,4 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
}
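
To make the delimiter handling concrete, here is a standalone sketch of the getOrDefault lookup against a table's serde parameters (a plain HashMap stands in for hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters()). Note that only the field delimiter is read from the table properties; the line delimiter is always set to "\n" in this commit.

```java
import java.util.HashMap;
import java.util.Map;

// Standalone sketch of the delimiter resolution in the hunk above; a plain
// map stands in for the serde parameters fetched from the Hive metastore.
public class DelimiterResolutionDemo {
    private static final String PROP_FIELD_DELIMITER = "field.delim";
    private static final String DEFAULT_FIELD_DELIMITER = "|";

    public static void main(String[] args) {
        Map<String, String> serdeParams = new HashMap<>();
        // e.g. ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        serdeParams.put("field.delim", ",");

        // Table that declares a delimiter: use it.
        System.out.println(serdeParams.getOrDefault(
                PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER)); // ","

        // Table with no delimiter in its serde properties: fall back to "|".
        System.out.println(new HashMap<String, String>().getOrDefault(
                PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER)); // "|"
    }
}
```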