[feature](tvf) Support compress file for tvf hdfs() and s3() (#19530)

We can support this by adding a new property for the tvf, e.g.:

`select * from hdfs("uri" = "xxx", ..., "compress_type" = "lz4", ...)`

User can:

Specify the compression type explicitly by setting `"compress_type" = "xxx"`.
Let Doris infer the compression type from the suffix of the file name (e.g. `file1.gz`).
Currently, only compressed files in `csv` format can be read; the BE side already supports this.
All that needs to be done is to analyze `"compress_type"` on the FE side and pass it to BE.
This commit is contained in:
Weijie Guo
2023-05-16 08:50:43 +08:00
committed by GitHub
parent 8284c342cb
commit 9535ed01aa
11 changed files with 93 additions and 6 deletions

View File

@ -552,8 +552,13 @@ public class Util {
}
}
/**
* Infer {@link TFileCompressType} from file name.
*
* @param path of file to be inferred.
*/
@NotNull
public static TFileCompressType getFileCompressType(String path) {
public static TFileCompressType inferFileCompressTypeByPath(String path) {
String lowerCasePath = path.toLowerCase();
if (lowerCasePath.endsWith(".gz")) {
return TFileCompressType.GZ;
@ -572,6 +577,20 @@ public class Util {
}
}
/**
 * Parses an explicitly specified compression type name into a {@link TFileCompressType}.
 *
 * <p>Matching is case-insensitive. The name is upper-cased with {@link java.util.Locale#ROOT} so the
 * lookup does not depend on the JVM default locale: under a Turkish locale a lower-case {@code i}
 * upper-cases to the dotted capital (U+0130), which can never match an enum constant name.
 *
 * @param compressType name of a {@link TFileCompressType} constant, in any letter case
 * @return the enum constant with that name
 * @throws IllegalArgumentException if {@code compressType} is null or does not name a constant
 */
public static TFileCompressType getFileCompressType(String compressType) {
    if (compressType == null) {
        // Surface null the same way as any other invalid name, so callers that translate
        // IllegalArgumentException into a user-facing error also cover this case.
        throw new IllegalArgumentException("Compress type must not be null");
    }
    final String upperCaseType = compressType.toUpperCase(java.util.Locale.ROOT);
    return TFileCompressType.valueOf(upperCaseType);
}
/**
 * Returns the given compression type unless it is {@link TFileCompressType#UNKNOWN}, in which case the
 * type is inferred from the file path instead.
 *
 * @param compressType explicitly requested type, passed through when it is not UNKNOWN
 * @param path file path, consulted only when {@code compressType} is UNKNOWN
 * @return {@code compressType} itself, or the result of {@link #inferFileCompressTypeByPath(String)}
 */
public static TFileCompressType getOrInferCompressType(TFileCompressType compressType, String path) {
    if (compressType != TFileCompressType.UNKNOWN) {
        return compressType;
    }
    // inferFileCompressTypeByPath lower-cases the path itself, so no pre-normalization is needed here.
    return inferFileCompressTypeByPath(path);
}
public static boolean isCsvFormat(TFileFormatType fileFormatType) {
return fileFormatType == TFileFormatType.FORMAT_CSV_BZ2
|| fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE

View File

@ -334,7 +334,7 @@ public abstract class FileQueryScanNode extends FileScanNode {
}
/**
 * Resolves the compression type to use when reading the given split, based on the split's path string.
 *
 * <p>NOTE(review): two consecutive return statements are shown here, presumably the removed and the
 * added version of the same line in this diff hunk; only the {@code inferFileCompressTypeByPath}
 * call is expected to remain in the resulting file. Confirm against the actual source.
 *
 * @param fileSplit split whose path is passed to the {@code Util} helper
 * @return the compression type reported by the helper
 * @throws UserException declared by the signature; nothing in the visible body raises it directly
 */
protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException {
return Util.getFileCompressType(fileSplit.getPath().toString());
return Util.inferFileCompressTypeByPath(fileSplit.getPath().toString());
}
protected TFileAttributes getFileAttributes() throws UserException {

View File

@ -23,12 +23,14 @@ import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.DdlException;
import org.apache.doris.common.MetaNotFoundException;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.Util;
import org.apache.doris.planner.PlanNodeId;
import org.apache.doris.spi.Split;
import org.apache.doris.statistics.StatisticalType;
import org.apache.doris.tablefunction.ExternalFileTableValuedFunction;
import org.apache.doris.thrift.TBrokerFileStatus;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileType;
@ -83,6 +85,12 @@ public class TVFScanNode extends FileQueryScanNode {
return tableValuedFunction.getTFileFormatType();
}
/**
 * Resolves the compression type for a split of this table-valued function scan.
 *
 * <p>The type configured on the table-valued function is handed to
 * {@code Util.getOrInferCompressType} together with the split's path string.
 *
 * @param fileSplit split whose path is passed along
 * @return the compression type returned by {@code Util.getOrInferCompressType}
 */
@Override
protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException {
    return Util.getOrInferCompressType(
            tableValuedFunction.getTFileCompressType(),
            fileSplit.getPath().toString());
}
@Override
public TFileType getLocationType() throws DdlException, MetaNotFoundException {
return tableValuedFunction.getTFileType();

View File

@ -34,6 +34,7 @@ import org.apache.doris.common.FeNameFormat;
import org.apache.doris.common.Pair;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.Util;
import org.apache.doris.datasource.property.constants.S3Properties;
import org.apache.doris.planner.PlanNodeId;
import org.apache.doris.planner.ScanNode;
@ -48,6 +49,7 @@ import org.apache.doris.rpc.RpcException;
import org.apache.doris.system.Backend;
import org.apache.doris.thrift.TBrokerFileStatus;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanRange;
@ -95,6 +97,7 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
protected static final String TRIM_DOUBLE_QUOTES = "trim_double_quotes";
protected static final String SKIP_LINES = "skip_lines";
protected static final String CSV_SCHEMA = "csv_schema";
protected static final String COMPRESS_TYPE = "compress_type";
// decimal(p,s)
private static final Pattern DECIMAL_TYPE_PATTERN = Pattern.compile("decimal\\((\\d+),(\\d+)\\)");
// datetime(p)
@ -124,6 +127,7 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
protected Map<String, String> locationProperties;
private TFileFormatType fileFormatType;
private TFileCompressType compressionType;
private String headerType = "";
private String columnSeparator = DEFAULT_COLUMN_SEPARATOR;
@ -147,6 +151,10 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
return fileFormatType;
}
/** Returns the compression type currently held by this table-valued function. */
public TFileCompressType getTFileCompressType() {
    return this.compressionType;
}
/** Returns the location properties map held by this function (the field reference itself, not a copy). */
public Map<String, String> getLocationProperties() {
    return this.locationProperties;
}
@ -212,7 +220,11 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
fuzzyParse = Boolean.valueOf(validParams.get(FUZZY_PARSE)).booleanValue();
trimDoubleQuotes = Boolean.valueOf(validParams.get(TRIM_DOUBLE_QUOTES)).booleanValue();
skipLines = Integer.valueOf(validParams.getOrDefault(SKIP_LINES, "0")).intValue();
try {
compressionType = Util.getFileCompressType(validParams.getOrDefault(COMPRESS_TYPE, "UNKNOWN"));
} catch (IllegalArgumentException e) {
throw new AnalysisException("Compress type : " + validParams.get(COMPRESS_TYPE) + " is not supported.");
}
if (formatString.equals("csv") || formatString.equals("csv_with_names")
|| formatString.equals("csv_with_names_and_types")) {
parseCsvSchema(csvSchema, validParams);
@ -451,6 +463,7 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
throw new AnalysisException("Can not get first file, please check uri.");
}
fileScanRangeParams.setCompressType(Util.getOrInferCompressType(compressionType, firstFile.getPath()));
// set TFileRangeDesc
TFileRangeDesc fileRangeDesc = new TFileRangeDesc();
fileRangeDesc.setPath(firstFile.getPath());