[Bug](iceberg) fix reading partitioned Iceberg tables without partition values in the file path (#25503)
Iceberg does not require partition values to be encoded in data file paths, so we should take the partition values from `PartitionScanTask.partition()` instead of parsing them out of the path.
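The idea in isolation: partition values live in table metadata (`DataFile.partition()` returns a `StructLike` with one entry per partition-spec field), so they can be read from the scan plan even when data files do not sit under the usual `<table>/data/<k=v>/` layout. A minimal sketch against the public Iceberg API; `Table`, `FileScanTask`, `DataFile` and `StructLike` are real Iceberg types, while the helper itself and its wiring are illustrative, not Doris code:

    import org.apache.iceberg.DataFile;
    import org.apache.iceberg.FileScanTask;
    import org.apache.iceberg.StructLike;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.io.CloseableIterable;

    import java.io.IOException;
    import java.io.UncheckedIOException;
    import java.util.ArrayList;
    import java.util.List;

    public class PartitionValuesSketch {
        // Collect the partition values of every data file in a scan plan.
        // The values come from table metadata (DataFile.partition()), so this
        // works even when files do not follow a <table>/data/<k=v>/ layout.
        static List<List<String>> partitionValuesPerFile(Table table) {
            List<List<String>> result = new ArrayList<>();
            try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
                for (FileScanTask task : tasks) {
                    DataFile file = task.file();
                    StructLike partition = file.partition(); // one entry per partition-spec field
                    List<String> values = new ArrayList<>();
                    for (int i = 0; i < partition.size(); i++) {
                        values.add(String.valueOf(partition.get(i, Object.class)));
                    }
                    result.add(values);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
            return result;
        }
    }

For transformed specs (bucket, truncate, days, ...) the struct holds the transformed values, that is, the same values a Hive-style partition path would have encoded.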
@@ -562,9 +562,19 @@ void IcebergTableReader::_gen_file_col_names() {
         auto name = _file_col_names[i];
         auto iter = _table_col_to_file_col.find(name);
         if (iter == _table_col_to_file_col.end()) {
-            _all_required_col_names.emplace_back(name);
+            // If the user created the iceberg table by directly appending existing parquet files,
+            // there is no 'iceberg.schema' field in the parquet footer, so '_table_col_to_file_col' may be empty.
+            // Column matching ignores case, so the name is converted to lowercase here.
+            auto name_low = to_lower(name);
+            _all_required_col_names.emplace_back(name_low);
             if (_has_iceberg_schema) {
                 _not_in_file_col_names.emplace_back(name);
+            } else {
+                _table_col_to_file_col.emplace(name, name_low);
+                _file_col_to_table_col.emplace(name_low, name);
+                if (name != name_low) {
+                    _has_schema_change = true;
+                }
             }
         } else {
             _all_required_col_names.emplace_back(iter->second);

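In other words: without the 'iceberg.schema' footer key there are no Iceberg field ids to match on, so the reader falls back to matching table columns to file columns by lowercased name and records whether any name differs by case alone. A minimal Java restatement of that fallback (not Doris code; `tableColNames` and the map parameters are illustrative stand-ins for the fields in the hunk above):

    import java.util.List;
    import java.util.Locale;
    import java.util.Map;

    class CaseFallbackSketch {
        // Build the lowercase fallback mapping and report whether a
        // case-only "schema change" was detected.
        static boolean buildLowercaseMapping(List<String> tableColNames,
                                             Map<String, String> tableColToFileCol,
                                             Map<String, String> fileColToTableCol) {
            boolean hasSchemaChange = false;
            for (String name : tableColNames) {
                String nameLow = name.toLowerCase(Locale.ROOT);
                tableColToFileCol.put(name, nameLow);
                fileColToTableCol.put(nameLow, name);
                if (!name.equals(nameLow)) {
                    hasSchemaChange = true;
                }
            }
            return hasSchemaChange;
        }
    }
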
@@ -296,7 +296,8 @@ public class SlotDescriptor {
     public TSlotDescriptor toThrift() {
         // Non-nullable slots will have 0 for the byte offset and -1 for the bit mask
         TSlotDescriptor tSlotDescriptor = new TSlotDescriptor(id.asInt(), parent.getId().asInt(), type.toThrift(), -1,
-                byteOffset, 0, getIsNullable() ? 0 : -1, ((column != null) ? column.getNonShadowName() : ""), slotIdx,
+                byteOffset, 0, getIsNullable() ? 0 : -1,
+                ((column != null) ? column.getNonShadowName() : ""), slotIdx,
                 isMaterialized);
         tSlotDescriptor.setNeedMaterialize(needMaterialize);
         tSlotDescriptor.setIsAutoIncrement(isAutoInc);

@@ -62,8 +62,8 @@ import org.apache.iceberg.HistoryEntry;
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.StructLike;
 import org.apache.iceberg.Table;
-import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.TableScan;
 import org.apache.iceberg.exceptions.NotFoundException;
 import org.apache.iceberg.expressions.Expression;

@@ -86,7 +86,6 @@ import java.util.stream.Collectors;
 public class IcebergScanNode extends FileQueryScanNode {
 
     public static final int MIN_DELETE_FILE_SUPPORT_VERSION = 2;
-    public static final String DEFAULT_DATA_PATH = "/data/";
     private static final String TOTAL_RECORDS = "total-records";
     private static final String TOTAL_POSITION_DELETES = "total-position-deletes";
     private static final String TOTAL_EQUALITY_DELETES = "total-equality-deletes";

@@ -201,8 +200,6 @@ public class IcebergScanNode extends FileQueryScanNode {
         // Min split size is DEFAULT_SPLIT_SIZE(128MB).
         long splitSize = Math.max(ConnectContext.get().getSessionVariable().getFileSplitSize(), DEFAULT_SPLIT_SIZE);
         HashSet<String> partitionPathSet = new HashSet<>();
-        String dataPath = normalizeLocation(icebergTable.location()) + icebergTable.properties()
-                .getOrDefault(TableProperties.WRITE_DATA_LOCATION, DEFAULT_DATA_PATH);
         boolean isPartitionedTable = icebergTable.spec().isPartitioned();
 
         CloseableIterable<FileScanTask> fileScanTasks = TableScanUtil.splitFiles(scan.planFiles(), splitSize);

@@ -211,12 +208,18 @@ public class IcebergScanNode extends FileQueryScanNode {
         combinedScanTasks.forEach(taskGrp -> taskGrp.files().forEach(splitTask -> {
             String dataFilePath = normalizeLocation(splitTask.file().path().toString());
 
-            // Counts the number of partitions read
+            List<String> partitionValues = new ArrayList<>();
             if (isPartitionedTable) {
-                int last = dataFilePath.lastIndexOf("/");
-                if (last > 0) {
-                    partitionPathSet.add(dataFilePath.substring(dataPath.length(), last));
+                StructLike structLike = splitTask.file().partition();
+
+                // set partitionValue for this IcebergSplit
+                for (int i = 0; i < structLike.size(); i++) {
+                    String partition = String.valueOf(structLike.get(i, Object.class));
+                    partitionValues.add(partition);
                 }
+
+                // Counts the number of partitions read
+                partitionPathSet.add(structLike.toString());
             }
 
             Path finalDataFilePath = S3Util.toScanRangeLocation(dataFilePath, source.getCatalog().getProperties());

@@ -227,7 +230,8 @@ public class IcebergScanNode extends FileQueryScanNode {
                     splitTask.file().fileSizeInBytes(),
                     new String[0],
                     formatVersion,
-                    source.getCatalog().getProperties());
+                    source.getCatalog().getProperties(),
+                    partitionValues);
             if (formatVersion >= MIN_DELETE_FILE_SUPPORT_VERSION) {
                 split.setDeleteFileFilters(getDeleteFileFilters(splitTask));
             }

@@ -30,8 +30,9 @@ public class IcebergSplit extends FileSplit {
 
     // File path will be changed if the file is modified, so there's no need to get modification time.
     public IcebergSplit(Path file, long start, long length, long fileLength, String[] hosts,
-                        Integer formatVersion, Map<String, String> config) {
-        super(file, start, length, fileLength, hosts, null);
+                        Integer formatVersion, Map<String, String> config,
+                        List<String> partitionList) {
+        super(file, start, length, fileLength, hosts, partitionList);
         this.formatVersion = formatVersion;
         this.config = config;
     }

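Putting the two ends together, a hypothetical helper showing how a split can be built after this change. The `IcebergSplit` signature is the one from the hunk above (assumed to be on the classpath); the Hadoop `Path` type and taking `start`/`length` from the scan task are assumptions for illustration:

    import org.apache.hadoop.fs.Path;
    import org.apache.iceberg.FileScanTask;
    import org.apache.iceberg.StructLike;

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    class SplitBuilderSketch {
        static IcebergSplit toSplit(FileScanTask task, Path location,
                                    Integer formatVersion, Map<String, String> config) {
            // Partition values come from metadata, not from the file path.
            List<String> partitionValues = new ArrayList<>();
            StructLike partition = task.file().partition();
            for (int i = 0; i < partition.size(); i++) {
                partitionValues.add(String.valueOf(partition.get(i, Object.class)));
            }
            return new IcebergSplit(location, task.start(), task.length(),
                    task.file().fileSizeInBytes(), new String[0],
                    formatVersion, config, partitionValues);
        }
    }
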
@@ -15,3 +15,12 @@
 1 Customer#000000001 j5JsirBM9P MOROCCO 0 MOROCCO AFRICA 25-989-741-2988 BUILDING
 3 Customer#000000003 fkRGN8n ARGENTINA7 ARGENTINA AMERICA 11-719-748-3364 AUTOMOBILE
 5 Customer#000000005 hwBtxkoBF qSW4KrI CANADA 5 CANADA AMERICA 13-750-942-6364 HOUSEHOLD
+
+-- !q04 --
+1 1970-01-03 09:02:03.000001 a
+1 1970-01-03 09:02:03.000001 b
+2 1970-01-03 09:02:04.000001 c
+2 1970-01-03 09:02:04.000001 d
+
+-- !q05 --
+463870

@@ -36,8 +36,14 @@ suite("test_external_catalog_iceberg_hadoop_catalog", "p2,external,iceberg,exter
         qt_q02 """ select c_custkey from iceberg_hadoop_catalog group by c_custkey order by c_custkey limit 7 """
         qt_q03 """ select * from iceberg_hadoop_catalog order by c_custkey limit 3 """
     }
 
+    def q02 = {
+        qt_q04 """ select * from multi_partition2 order by val """
+        qt_q05 """ select count(*) from table_with_append_file where MAN_ID is not null """
+    }
+
     sql """ use `multi_catalog`; """
     q01()
+    q02()
 }
 }