[feature](Paimon) support deletion vector for Paimon naive reader (#34743) (#35241)

bp #34743
Co-authored-by: 苏小刚 <suxiaogang223@icloud.com>
This commit is contained in:
Mingyu Chen
2024-05-23 00:01:30 +08:00
committed by GitHub
parent 50f50cf8cc
commit adc364a6fd
20 changed files with 590 additions and 174 deletions

View File

@ -38,6 +38,7 @@ import org.apache.doris.thrift.TExplainLevel;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.TPaimonDeletionFileDesc;
import org.apache.doris.thrift.TPaimonFileDesc;
import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TTableFormatFileDesc;
@ -51,6 +52,7 @@ import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.DeletionFile;
import org.apache.paimon.table.source.RawFile;
import org.apache.paimon.table.source.ReadBuilder;
import org.apache.paimon.utils.InstantiationUtil;
@ -66,11 +68,46 @@ import java.util.Set;
import java.util.stream.Collectors;
public class PaimonScanNode extends FileQueryScanNode {
private enum SplitReadType {
JNI,
NATIVE,
}
private class SplitStat {
SplitReadType type = SplitReadType.JNI;
private long rowCount = 0;
private boolean rawFileConvertable = false;
private boolean hasDeletionVector = false;
public void setType(SplitReadType type) {
this.type = type;
}
public void setRowCount(long rowCount) {
this.rowCount = rowCount;
}
public void setRawFileConvertable(boolean rawFileConvertable) {
this.rawFileConvertable = rawFileConvertable;
}
public void setHasDeletionVector(boolean hasDeletionVector) {
this.hasDeletionVector = hasDeletionVector;
}
@Override
public String toString() {
return "SplitStat [type=" + type + ", rowCount=" + rowCount + ", rawFileConvertable=" + rawFileConvertable
+ ", hasDeletionVector=" + hasDeletionVector + "]";
}
}
private static final Logger LOG = LogManager.getLogger(PaimonScanNode.class);
private PaimonSource source = null;
private List<Predicate> predicates;
private int rawFileSplitNum = 0;
private int paimonSplitNum = 0;
private List<SplitStat> splitStats = new ArrayList<>();
public PaimonScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumnPriv) {
super(id, desc, "PAIMON_SCAN_NODE", StatisticalType.PAIMON_SCAN_NODE, needCheckColumnPriv);
@ -133,6 +170,15 @@ public class PaimonScanNode extends FileQueryScanNode {
fileDesc.setDbId(((PaimonExternalTable) source.getTargetTable()).getDbId());
fileDesc.setTblId(source.getTargetTable().getId());
fileDesc.setLastUpdateTime(source.getTargetTable().getLastUpdateTime());
Optional<DeletionFile> optDeletionFile = paimonSplit.getDeletionFile();
if (optDeletionFile.isPresent()) {
DeletionFile deletionFile = optDeletionFile.get();
TPaimonDeletionFileDesc tDeletionFile = new TPaimonDeletionFileDesc();
tDeletionFile.setPath(deletionFile.path());
tDeletionFile.setOffset(deletionFile.offset());
tDeletionFile.setLength(deletionFile.length());
fileDesc.setDeletionFile(tDeletionFile);
}
tableFormatFileDesc.setPaimonParams(fileDesc);
rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
@ -152,19 +198,28 @@ public class PaimonScanNode extends FileQueryScanNode {
// Just for counting the number of selected partitions for this paimon table
Set<BinaryRow> selectedPartitionValues = Sets.newHashSet();
for (org.apache.paimon.table.source.Split split : paimonSplits) {
SplitStat splitStat = new SplitStat();
splitStat.setRowCount(split.rowCount());
if (!forceJniScanner && supportNative && split instanceof DataSplit) {
DataSplit dataSplit = (DataSplit) split;
BinaryRow partitionValue = dataSplit.partition();
selectedPartitionValues.add(partitionValue);
Optional<List<RawFile>> optRawFiles = dataSplit.convertToRawFiles();
Optional<List<DeletionFile>> optDeletionFiles = dataSplit.deletionFiles();
if (optRawFiles.isPresent()) {
splitStat.setType(SplitReadType.NATIVE);
splitStat.setRawFileConvertable(true);
List<RawFile> rawFiles = optRawFiles.get();
for (RawFile file : rawFiles) {
LocationPath locationPath = new LocationPath(file.path(), source.getCatalog().getProperties());
Path finalDataFilePath = locationPath.toStorageLocation();
try {
splits.addAll(
splitFile(
if (optDeletionFiles.isPresent()) {
List<DeletionFile> deletionFiles = optDeletionFiles.get();
for (int i = 0; i < rawFiles.size(); i++) {
RawFile file = rawFiles.get(i);
DeletionFile deletionFile = deletionFiles.get(i);
LocationPath locationPath = new LocationPath(file.path(),
source.getCatalog().getProperties());
Path finalDataFilePath = locationPath.toStorageLocation();
try {
List<Split> dorisSplits = splitFile(
finalDataFilePath,
0,
null,
@ -172,10 +227,40 @@ public class PaimonScanNode extends FileQueryScanNode {
-1,
true,
null,
PaimonSplit.PaimonSplitCreator.DEFAULT));
++rawFileSplitNum;
} catch (IOException e) {
throw new UserException("Paimon error to split file: " + e.getMessage(), e);
PaimonSplit.PaimonSplitCreator.DEFAULT);
for (Split dorisSplit : dorisSplits) {
// the element in DeletionFiles might be null
if (deletionFile != null) {
splitStat.setHasDeletionVector(true);
((PaimonSplit) dorisSplit).setDeletionFile(deletionFile);
}
splits.add(dorisSplit);
}
++rawFileSplitNum;
} catch (IOException e) {
throw new UserException("Paimon error to split file: " + e.getMessage(), e);
}
}
} else {
for (RawFile file : rawFiles) {
LocationPath locationPath = new LocationPath(file.path(),
source.getCatalog().getProperties());
Path finalDataFilePath = locationPath.toStorageLocation();
try {
splits.addAll(
splitFile(
finalDataFilePath,
0,
null,
file.length(),
-1,
true,
null,
PaimonSplit.PaimonSplitCreator.DEFAULT));
++rawFileSplitNum;
} catch (IOException e) {
throw new UserException("Paimon error to split file: " + e.getMessage(), e);
}
}
}
} else {
@ -186,6 +271,7 @@ public class PaimonScanNode extends FileQueryScanNode {
splits.add(new PaimonSplit(split));
++paimonSplitNum;
}
splitStats.add(splitStat);
}
this.readPartitionNum = selectedPartitionValues.size();
// TODO: get total partition number
@ -260,8 +346,15 @@ public class PaimonScanNode extends FileQueryScanNode {
@Override
public String getNodeExplainString(String prefix, TExplainLevel detailLevel) {
return super.getNodeExplainString(prefix, detailLevel)
String result = super.getNodeExplainString(prefix, detailLevel)
+ String.format("%spaimonNativeReadSplits=%d/%d\n",
prefix, rawFileSplitNum, (paimonSplitNum + rawFileSplitNum));
prefix, rawFileSplitNum, (paimonSplitNum + rawFileSplitNum));
if (detailLevel == TExplainLevel.VERBOSE) {
result += prefix + "PaimonSplitStats: \n";
for (SplitStat splitStat : splitStats) {
result += String.format("%s %s\n", prefix, splitStat);
}
}
return result;
}
}

View File

@ -22,24 +22,29 @@ import org.apache.doris.datasource.SplitCreator;
import org.apache.doris.datasource.TableFormatType;
import org.apache.hadoop.fs.Path;
import org.apache.paimon.table.source.DeletionFile;
import org.apache.paimon.table.source.Split;
import java.util.List;
import java.util.Optional;
public class PaimonSplit extends FileSplit {
private Split split;
private TableFormatType tableFormatType;
private Optional<DeletionFile> optDeletionFile;
public PaimonSplit(Split split) {
super(new Path("hdfs://dummyPath"), 0, 0, 0, null, null);
this.split = split;
this.tableFormatType = TableFormatType.PAIMON;
this.optDeletionFile = Optional.empty();
}
public PaimonSplit(Path file, long start, long length, long fileLength, String[] hosts,
List<String> partitionList) {
List<String> partitionList) {
super(file, start, length, fileLength, hosts, partitionList);
this.tableFormatType = TableFormatType.PAIMON;
this.optDeletionFile = Optional.empty();
}
public Split getSplit() {
@ -58,18 +63,26 @@ public class PaimonSplit extends FileSplit {
this.tableFormatType = tableFormatType;
}
public Optional<DeletionFile> getDeletionFile() {
return optDeletionFile;
}
public void setDeletionFile(DeletionFile deletionFile) {
this.optDeletionFile = Optional.of(deletionFile);
}
public static class PaimonSplitCreator implements SplitCreator {
static final PaimonSplitCreator DEFAULT = new PaimonSplitCreator();
@Override
public org.apache.doris.spi.Split create(Path path,
long start,
long length,
long fileLength,
long modificationTime,
String[] hosts,
List<String> partitionValues) {
long start,
long length,
long fileLength,
long modificationTime,
String[] hosts,
List<String> partitionValues) {
return new PaimonSplit(path, start, length, fileLength, hosts, partitionValues);
}
}