[Fix](tvf) Fix tvf reading of empty files in compressed formats. (#34926)

1. Fix the issue with tvf reading empty compressed files.
2. Move two test cases (`test_local_tvf_compression` and `test_s3_tvf_compression`) from p2 to p0.
This commit is contained in:
Tiewei Fang
2024-05-21 09:57:15 +08:00
committed by yiguolei
parent 944d9bd4bd
commit c0fd98abe5
13 changed files with 78 additions and 10 deletions

View File

@ -482,7 +482,7 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
// get first file, used to parse table schema
TBrokerFileStatus firstFile = null;
for (TBrokerFileStatus fileStatus : fileStatuses) {
if (fileStatus.isIsDir() || fileStatus.size == 0) {
if (isFileContentEmpty(fileStatus)) {
continue;
}
firstFile = fileStatus;
@ -514,5 +514,43 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
return InternalService.PFetchTableSchemaRequest.newBuilder()
.setFileScanRange(ByteString.copyFrom(new TSerializer().serialize(fileScanRange))).build();
}
/**
 * Tells whether the given file entry should be treated as having no content.
 *
 * <p>Directories and zero-length files are always reported as empty. For CSV
 * and JSON formats the file is additionally reported as empty when its size is
 * non-negative and does not exceed a threshold chosen by {@code compressionType}.
 * All other formats are only subject to the directory / zero-length check.
 *
 * <p>NOTE(review): the per-codec thresholds presumably correspond to the byte
 * length of an empty input compressed with that codec (headers/trailers only)
 * -- confirm against the BE readers before changing them.
 *
 * @param fileStatus the file entry to inspect
 * @return {@code true} if the entry is a directory, has size 0, or (CSV/JSON
 *         only) is no larger than the threshold for the current compression type
 */
private boolean isFileContentEmpty(TBrokerFileStatus fileStatus) {
    if (fileStatus.isIsDir() || fileStatus.size == 0) {
        return true;
    }

    // The compressed-size heuristic is applied to CSV and JSON files only.
    if (!Util.isCsvFormat(fileFormatType) && fileFormatType != TFileFormatType.FORMAT_JSON) {
        return false;
    }

    final int emptySizeThreshold;
    switch (compressionType) {
        case GZ:
            emptySizeThreshold = 20;
            break;
        case LZO:
        case LZOP:
            emptySizeThreshold = 42;
            break;
        case DEFLATE:
            emptySizeThreshold = 8;
            break;
        case BZ2:
            emptySizeThreshold = 14;
            break;
        case SNAPPYBLOCK:
        case LZ4BLOCK:
        case LZ4FRAME:
            emptySizeThreshold = 4;
            break;
        default:
            // UNKNOWN, PLAIN and every other type: no extra bytes are tolerated,
            // so only the zero-length check above can mark the file as empty.
            emptySizeThreshold = 0;
            break;
    }

    // fileStatus.size may be -1 in http_stream; a negative size is never treated as empty.
    return fileStatus.size >= 0 && fileStatus.size <= emptySizeThreshold;
}
}