[fix](csv-reader) fix bug that csv reader cannot read text format hms table (#13515)

1. Set the previously missing field and line delimiters for text tables.
2. When querying an external table in text (csv) format, we should pass the column position map to the BE; otherwise the column order is wrong (see the sketch below).

TODO:
1. For now, querying a csv file with a non-existent column returns null, but it should return either null or the default value of that column.
2. Add a regression test once the hive docker environment is ready.
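
To illustrate the column-order problem, here is a minimal, self-contained sketch. This is not Doris code: "project" and all names in it are hypothetical. The file stores fields in base-schema order, so the reader must translate each requested slot into a file-field position instead of reading fields in slot order.

// Illustrative sketch only: shows why a column position map is needed when
// reading a delimited text line. All names here are hypothetical, not Doris APIs.
import java.util.List;

public class CsvColumnMappingSketch {
    // Table schema on file: c1|c2|c3. Query: SELECT c3, c1 FROM t.
    // Without a position map the reader would take fields 0 and 1 (c1, c2)
    // for the two slots; with the map it takes fields 2 and 0 (c3, c1).
    static String[] project(String line, String separator, List<Integer> columnIdxs) {
        String[] fields = line.split(java.util.regex.Pattern.quote(separator), -1);
        String[] out = new String[columnIdxs.size()];
        for (int i = 0; i < columnIdxs.size(); i++) {
            int idx = columnIdxs.get(i);
            out[i] = idx < fields.length ? fields[idx] : null; // missing field -> null
        }
        return out;
    }

    public static void main(String[] args) {
        // Fields on disk: c1=a, c2=b, c3=c
        String[] projected = project("a|b|c", "|", List.of(2, 0));
        System.out.println(String.join(",", projected)); // prints: c,a
    }
}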
@@ -810,6 +810,8 @@ public class HiveMetaStoreClientHelper {
                 return Type.FLOAT;
             case "double":
                 return Type.DOUBLE;
+            case "string":
+                return Type.STRING;
             default:
                 break;
         }
@@ -923,3 +925,4 @@ public class HiveMetaStoreClientHelper {
         return output.toString();
     }
 }

@@ -73,6 +73,17 @@ public interface TableIf {
 
     Column getColumn(String name);
 
+    default int getBaseColumnIdxByName(String colName) {
+        int i = 0;
+        for (Column col : getBaseSchema()) {
+            if (col.getName().equalsIgnoreCase(colName)) {
+                return i;
+            }
+            ++i;
+        }
+        return -1;
+    }
+
     String getMysqlType();
 
     String getEngine();
@@ -163,3 +174,4 @@ public interface TableIf {
         }
     }
 }

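A self-contained illustration of the lookup semantics this new default method adds. The demo class and "Col" record are stand-ins, not Doris classes: matching is case-insensitive against the base schema, and a miss yields -1, which callers must handle.

// Minimal demo of getBaseColumnIdxByName semantics (stand-in types).
import java.util.Arrays;
import java.util.List;

class ColumnIdxDemo {
    record Col(String name) {} // stand-in for org.apache.doris Column

    static int baseColumnIdx(List<Col> schema, String colName) {
        int i = 0;
        for (Col col : schema) {
            if (col.name().equalsIgnoreCase(colName)) {
                return i;
            }
            ++i;
        }
        return -1; // not found; callers translate this into an error
    }

    public static void main(String[] args) {
        List<Col> schema = Arrays.asList(new Col("id"), new Col("name"), new Col("city"));
        System.out.println(baseColumnIdx(schema, "NAME")); // 1 (case-insensitive)
        System.out.println(baseColumnIdx(schema, "zip"));  // -1
    }
}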
@@ -311,3 +311,4 @@ public class HMSExternalTable extends ExternalTable {
         return catalog.getCatalogProperty().getS3Properties();
     }
 }

@@ -70,6 +70,7 @@ public class HMSExternalCatalog extends ExternalCatalog {
             client = new HiveMetaStoreClient(hiveConf);
         } catch (MetaException e) {
             LOG.warn("Failed to create HiveMetaStoreClient: {}", e.getMessage());
+            return;
         }
         List<String> allDatabases;
         try {

@@ -278,6 +278,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
             ParamCreateContext context = contexts.get(i);
             FileScanProviderIf scanProvider = scanProviders.get(i);
             setDefaultValueExprs(scanProvider, context);
+            setColumnPositionMappingForTextFile(scanProvider, context);
             finalizeParamsForLoad(context, analyzer);
             createScanRangeLocations(context, scanProvider);
             this.inputSplitsNum += scanProvider.getInputSplitNum();
@@ -285,6 +286,27 @@ public class ExternalFileScanNode extends ExternalScanNode {
         }
     }
 
+    private void setColumnPositionMappingForTextFile(FileScanProviderIf scanProvider, ParamCreateContext context)
+            throws UserException {
+        if (type != Type.QUERY) {
+            return;
+        }
+        TableIf tbl = scanProvider.getTargetTable();
+        List<Integer> columnIdxs = Lists.newArrayList();
+        for (SlotDescriptor slot : desc.getSlots()) {
+            if (!slot.isMaterialized()) {
+                continue;
+            }
+            String colName = slot.getColumn().getName();
+            int idx = tbl.getBaseColumnIdxByName(colName);
+            if (idx == -1) {
+                throw new UserException("Column " + colName + " not found in table " + tbl.getName());
+            }
+            columnIdxs.add(idx);
+        }
+        context.params.setColumnIdxs(columnIdxs);
+    }
+
     protected void setDefaultValueExprs(FileScanProviderIf scanProvider, ParamCreateContext context)
             throws UserException {
         TableIf tbl = scanProvider.getTargetTable();
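As a worked example of what this method computes, assuming a hypothetical schema and query:

// Worked example (hypothetical schema/query). Base schema: k1, k2, k3.
// Query: SELECT k3, k1 FROM hms_text_table;
// Materialized slots in desc order: [k3, k1]
// -> getBaseColumnIdxByName("k3") == 2, getBaseColumnIdxByName("k1") == 0
// -> context.params.setColumnIdxs(List.of(2, 0));
// The BE then reads field 2 and field 0 from every delimited line.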
@@ -320,7 +342,7 @@ public class ExternalFileScanNode extends ExternalScanNode {
             default:
                 Preconditions.checkState(false, type);
         }
-        // if slot desc is null, which mean it is a unrelated slot, just skip.
+        // if slot desc is null, which mean it is an unrelated slot, just skip.
         // eg:
         // (a, b, c) set (x=a, y=b, z=c)
         // c does not exist in file, the z will be filled with null, even if z has default value.
@@ -499,5 +521,3 @@ public class ExternalFileScanNode extends ExternalScanNode {
     }
 }

@@ -35,11 +35,13 @@ import org.apache.doris.load.BrokerFileGroup;
 import org.apache.doris.planner.external.ExternalFileScanNode.ParamCreateContext;
 import org.apache.doris.system.Backend;
 import org.apache.doris.thrift.TExternalScanRange;
+import org.apache.doris.thrift.TFileAttributes;
 import org.apache.doris.thrift.TFileFormatType;
 import org.apache.doris.thrift.TFileRangeDesc;
 import org.apache.doris.thrift.TFileScanRange;
 import org.apache.doris.thrift.TFileScanRangeParams;
 import org.apache.doris.thrift.TFileScanSlotInfo;
+import org.apache.doris.thrift.TFileTextScanRangeParams;
 import org.apache.doris.thrift.TFileType;
 import org.apache.doris.thrift.THdfsParams;
 import org.apache.doris.thrift.TNetworkAddress;
@@ -76,6 +78,10 @@ import java.util.stream.Collectors;
 public class HiveScanProvider implements HMSTableScanProviderIf {
     private static final Logger LOG = LogManager.getLogger(HiveScanProvider.class);
 
+    private static final String PROP_FIELD_DELIMITER = "field.delim";
+    private static final String DEFAULT_FIELD_DELIMITER = "|";
+    private static final String DEFAULT_LINE_DELIMITER = "\n";
+
     protected HMSExternalTable hmsTable;
 
     protected int inputSplitNum = 0;
@@ -268,7 +274,20 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
         String fsName = fullPath.replace(filePath, "");
         TFileType locationType = getLocationType();
         context.params.setFileType(locationType);
+        TFileFormatType fileFormatType = getFileFormatType();
         context.params.setFormatType(getFileFormatType());
+        if (fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+            String columnSeparator
+                    = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters()
+                    .getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER);
+            textParams.setColumnSeparator(columnSeparator);
+            textParams.setLineDelimiter(DEFAULT_LINE_DELIMITER);
+            TFileAttributes fileAttributes = new TFileAttributes();
+            fileAttributes.setTextParams(textParams);
+            context.params.setFileAttributes(fileAttributes);
+        }
+
         // set hdfs params for hdfs file type.
         Map<String, String> locationProperties = getLocationProperties();
         if (locationType == TFileType.FILE_HDFS) {
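The field delimiter comes from the table's SerDe property "field.delim", with "|" as the fallback, while the line delimiter is hardcoded to "\n" for now. A minimal sketch of that lookup, using a plain Map in place of getSerdeInfo().getParameters():

// Sketch of the delimiter resolution above; serdeParams stands in for the
// Hive table's SerDe parameter map.
import java.util.HashMap;
import java.util.Map;

class DelimiterDemo {
    static final String PROP_FIELD_DELIMITER = "field.delim";
    static final String DEFAULT_FIELD_DELIMITER = "|";

    public static void main(String[] args) {
        Map<String, String> serdeParams = new HashMap<>();
        serdeParams.put(PROP_FIELD_DELIMITER, ","); // e.g. a comma-delimited table
        String sep = serdeParams.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER);
        System.out.println(sep); // ","; prints "|" when the property is missing
    }
}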
@@ -364,3 +383,4 @@ public class HiveScanProvider implements HMSTableScanProviderIf {
 
 }