[fix](csv-reader) fix bug that csv reader cannot read text-format hms table (#13515)

1. Add the missing field and line delimiters when reading text (CSV) format files.
2. When querying an external table in text (CSV) format, pass the column position mapping to the BE;
    otherwise the column order is wrong (see the sketch below).
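
To make fix 2 concrete, here is a minimal standalone sketch of the idea (all class and variable names below are illustrative, not actual Doris APIs): a text-format row stores its fields in base-schema order, so each materialized slot must be mapped back to its schema position before the BE can pick the right field out of a parsed line.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Standalone demo of why the column position mapping is needed; names are
// illustrative, not Doris APIs. A text-format row stores fields in
// base-schema order, while the query materializes slots in SELECT order.
public class ColumnPositionMappingDemo {
    public static void main(String[] args) {
        List<String> baseSchema = Arrays.asList("id", "name", "age", "city");
        // Slots requested by the query, in SELECT order: SELECT city, id FROM t
        List<String> materializedSlots = Arrays.asList("city", "id");

        // Build the position map that the FE now sends to the BE.
        List<Integer> columnIdxs = new ArrayList<>();
        for (String slot : materializedSlots) {
            int idx = baseSchema.indexOf(slot); // the real code matches case-insensitively
            if (idx == -1) {
                throw new IllegalStateException("Column " + slot + " not found");
            }
            columnIdxs.add(idx);
        }

        // The BE picks fields out of each parsed line via the map.
        String[] parsedLine = {"1", "alice", "30", "paris"};
        for (int i = 0; i < materializedSlots.size(); i++) {
            System.out.println(materializedSlots.get(i) + " = " + parsedLine[columnIdxs.get(i)]);
        }
        // Prints "city = paris" and "id = 1". Without the map the BE would
        // read positionally and wrongly return "city = 1", "id = alice".
    }
}
```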

TODO:
1. For now, querying a CSV file with a non-existent column returns null,
    but it should return the column's default value (falling back to null when no default is defined).
2. Add regression tests once the Hive docker environment is ready.
Mingyu Chen
2022-10-22 22:40:03 +08:00
committed by GitHub
parent a7c221d04e
commit 3a3def447d
11 changed files with 147 additions and 57 deletions


@@ -810,6 +810,8 @@ public class HiveMetaStoreClientHelper {
return Type.FLOAT;
case "double":
return Type.DOUBLE;
case "string":
return Type.STRING;
default:
break;
}
@@ -923,3 +925,4 @@ public class HiveMetaStoreClientHelper {
return output.toString();
}
}
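
As a side note, the two added `case` lines above close a gap in the primitive-type mapping: a Hive `string` column previously fell through to the `default` branch and was left unresolved. A minimal stand-in for that switch (the method name and string return values here are illustrative; the real helper in HiveMetaStoreClientHelper returns Doris `Type` values):

```java
// Illustrative stand-in for the switch in the hunk above; the real code
// returns org.apache.doris.catalog.Type values rather than strings.
public class HiveTypeMappingSketch {
    static String mapHivePrimitiveType(String hiveType) {
        switch (hiveType.toLowerCase()) {
            case "float":
                return "FLOAT";
            case "double":
                return "DOUBLE";
            case "string": // newly handled by this commit
                return "STRING";
            default:
                return "UNSUPPORTED"; // the real code breaks out and resolves elsewhere
        }
    }

    public static void main(String[] args) {
        System.out.println(mapHivePrimitiveType("string")); // STRING instead of UNSUPPORTED
    }
}
```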


@@ -73,6 +73,17 @@ public interface TableIf {
Column getColumn(String name);
default int getBaseColumnIdxByName(String colName) {
int i = 0;
for (Column col : getBaseSchema()) {
if (col.getName().equalsIgnoreCase(colName)) {
return i;
}
++i;
}
return -1;
}
String getMysqlType();
String getEngine();
@@ -163,3 +174,4 @@ public interface TableIf {
}
}
}
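
A quick standalone illustration of the new getBaseColumnIdxByName default method's semantics (plain strings stand in for the Column objects returned by getBaseSchema()): the match is case-insensitive, and a miss returns -1, which the scan node further down turns into a UserException.

```java
import java.util.Arrays;
import java.util.List;

// Standalone sketch of getBaseColumnIdxByName's semantics; plain strings
// stand in for the real Column objects returned by getBaseSchema().
public class BaseColumnIdxDemo {
    static int getBaseColumnIdxByName(List<String> baseSchema, String colName) {
        int i = 0;
        for (String col : baseSchema) {
            if (col.equalsIgnoreCase(colName)) {
                return i;
            }
            ++i;
        }
        return -1; // the caller decides how to handle a missing column
    }

    public static void main(String[] args) {
        List<String> schema = Arrays.asList("id", "Name", "age");
        System.out.println(getBaseColumnIdxByName(schema, "NAME")); // 1 (case-insensitive)
        System.out.println(getBaseColumnIdxByName(schema, "city")); // -1 (not found)
    }
}
```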


@@ -311,3 +311,4 @@ public class HMSExternalTable extends ExternalTable {
return catalog.getCatalogProperty().getS3Properties();
}
}


@@ -70,6 +70,7 @@ public class HMSExternalCatalog extends ExternalCatalog {
client = new HiveMetaStoreClient(hiveConf);
} catch (MetaException e) {
LOG.warn("Failed to create HiveMetaStoreClient: {}", e.getMessage());
return;
}
List<String> allDatabases;
try {


@@ -278,6 +278,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
ParamCreateContext context = contexts.get(i);
FileScanProviderIf scanProvider = scanProviders.get(i);
setDefaultValueExprs(scanProvider, context);
setColumnPositionMappingForTextFile(scanProvider, context);
finalizeParamsForLoad(context, analyzer);
createScanRangeLocations(context, scanProvider);
this.inputSplitsNum += scanProvider.getInputSplitNum();
@@ -285,6 +286,27 @@ public class ExternalFileScanNode extends ExternalScanNode {
}
}
private void setColumnPositionMappingForTextFile(FileScanProviderIf scanProvider, ParamCreateContext context)
throws UserException {
if (type != Type.QUERY) {
return;
}
TableIf tbl = scanProvider.getTargetTable();
List<Integer> columnIdxs = Lists.newArrayList();
for (SlotDescriptor slot : desc.getSlots()) {
if (!slot.isMaterialized()) {
continue;
}
String colName = slot.getColumn().getName();
int idx = tbl.getBaseColumnIdxByName(colName);
if (idx == -1) {
throw new UserException("Column " + colName + " not found in table " + tbl.getName());
}
columnIdxs.add(idx);
}
context.params.setColumnIdxs(columnIdxs);
}
protected void setDefaultValueExprs(FileScanProviderIf scanProvider, ParamCreateContext context)
throws UserException {
TableIf tbl = scanProvider.getTargetTable();
@@ -320,7 +342,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
default:
Preconditions.checkState(false, type);
}
// if slot desc is null, which means it is an unrelated slot, just skip.
// eg:
// (a, b, c) set (x=a, y=b, z=c)
// c does not exist in the file, so z will be filled with null, even if z has a default value.
@@ -499,5 +521,3 @@ public class ExternalFileScanNode extends ExternalScanNode {
}
}


@@ -35,11 +35,13 @@ import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.planner.external.ExternalFileScanNode.ParamCreateContext;
import org.apache.doris.system.Backend;
import org.apache.doris.thrift.TExternalScanRange;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanRange;
import org.apache.doris.thrift.TFileScanRangeParams;
import org.apache.doris.thrift.TFileScanSlotInfo;
import org.apache.doris.thrift.TFileTextScanRangeParams;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.THdfsParams;
import org.apache.doris.thrift.TNetworkAddress;
@@ -76,6 +78,10 @@ import java.util.stream.Collectors;
public class HiveScanProvider implements HMSTableScanProviderIf {
private static final Logger LOG = LogManager.getLogger(HiveScanProvider.class);
private static final String PROP_FIELD_DELIMITER = "field.delim";
private static final String DEFAULT_FIELD_DELIMITER = "|";
private static final String DEFAULT_LINE_DELIMITER = "\n";
protected HMSExternalTable hmsTable;
protected int inputSplitNum = 0;
@@ -268,7 +274,20 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
String fsName = fullPath.replace(filePath, "");
TFileType locationType = getLocationType();
context.params.setFileType(locationType);
TFileFormatType fileFormatType = getFileFormatType();
context.params.setFormatType(fileFormatType);
if (fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
String columnSeparator
= hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters()
.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER);
textParams.setColumnSeparator(columnSeparator);
textParams.setLineDelimiter(DEFAULT_LINE_DELIMITER);
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
context.params.setFileAttributes(fileAttributes);
}
// set hdfs params for hdfs file type.
Map<String, String> locationProperties = getLocationProperties();
if (locationType == TFileType.FILE_HDFS) {
@@ -364,3 +383,4 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
}
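
To make the delimiter handling concrete, here is a standalone sketch of the getOrDefault lookup against a table's serde parameters (a plain HashMap stands in for hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters()). Note that only the field delimiter is read from the table properties; the line delimiter is always set to "\n" in this commit.

```java
import java.util.HashMap;
import java.util.Map;

// Standalone sketch of the delimiter resolution in the hunk above; a plain
// map stands in for the serde parameters fetched from the Hive metastore.
public class DelimiterResolutionDemo {
    private static final String PROP_FIELD_DELIMITER = "field.delim";
    private static final String DEFAULT_FIELD_DELIMITER = "|";

    public static void main(String[] args) {
        Map<String, String> serdeParams = new HashMap<>();
        // e.g. ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        serdeParams.put("field.delim", ",");

        // Table that declares a delimiter: use it.
        System.out.println(serdeParams.getOrDefault(
                PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER)); // ","

        // Table with no delimiter in its serde properties: fall back to "|".
        System.out.println(new HashMap<String, String>().getOrDefault(
                PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER)); // "|"
    }
}
```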