[fix](ParquetReader) definition level of repeated parent is wrong (#17337)
Fix three bugs: 1. `repeated_parent_def_level ` should be the definition of its repeated parent. 2. Failed to parse schema like `decimal(p, s)` 3. Fill wrong offsets for array type
This commit is contained in:
@ -55,11 +55,11 @@ static int num_children_node(const tparquet::SchemaElement& schema) {
|
||||
return schema.__isset.num_children ? schema.num_children : 0;
|
||||
}
|
||||
|
||||
static void set_child_node_level(FieldSchema* parent, size_t rep_inc = 0, size_t def_inc = 0) {
|
||||
static void set_child_node_level(FieldSchema* parent, int16_t repeated_parent_def_level) {
|
||||
for (auto& child : parent->children) {
|
||||
child.repetition_level = parent->repetition_level + rep_inc;
|
||||
child.definition_level = parent->definition_level + def_inc;
|
||||
child.repeated_parent_def_level = parent->definition_level;
|
||||
child.repetition_level = parent->repetition_level;
|
||||
child.definition_level = parent->definition_level;
|
||||
child.repeated_parent_def_level = repeated_parent_def_level;
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +129,7 @@ Status FieldDescriptor::parse_node_field(const std::vector<tparquet::SchemaEleme
|
||||
node_field->repetition_level++;
|
||||
node_field->definition_level++;
|
||||
node_field->children.resize(1);
|
||||
set_child_node_level(node_field);
|
||||
set_child_node_level(node_field, node_field->definition_level);
|
||||
auto child = &node_field->children[0];
|
||||
parse_physical_field(t_schema, false, child);
|
||||
|
||||
@ -315,7 +315,7 @@ Status FieldDescriptor::parse_group_field(const std::vector<tparquet::SchemaElem
|
||||
group_field->repetition_level++;
|
||||
group_field->definition_level++;
|
||||
group_field->children.resize(1);
|
||||
set_child_node_level(group_field);
|
||||
set_child_node_level(group_field, group_field->definition_level);
|
||||
auto struct_field = &group_field->children[0];
|
||||
// the list of struct:
|
||||
// repeated group <name> (LIST) {
|
||||
@ -379,16 +379,16 @@ Status FieldDescriptor::parse_list_field(const std::vector<tparquet::SchemaEleme
|
||||
// optional field, and the third level element is the nested structure in list
|
||||
// produce nested structure like: LIST<INT>, LIST<MAP>, LIST<LIST<...>>
|
||||
// skip bag/list, it's a repeated element.
|
||||
set_child_node_level(list_field);
|
||||
set_child_node_level(list_field, list_field->definition_level);
|
||||
RETURN_IF_ERROR(parse_node_field(t_schemas, curr_pos + 2, list_child));
|
||||
} else {
|
||||
// required field, produce the list of struct
|
||||
set_child_node_level(list_field);
|
||||
set_child_node_level(list_field, list_field->definition_level);
|
||||
RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, list_child));
|
||||
}
|
||||
} else if (num_children == 0) {
|
||||
// required two level list, for compatibility reason.
|
||||
set_child_node_level(list_field);
|
||||
set_child_node_level(list_field, list_field->definition_level);
|
||||
parse_physical_field(second_level, false, list_child);
|
||||
_next_schema_pos = curr_pos + 2;
|
||||
}
|
||||
@ -450,7 +450,7 @@ Status FieldDescriptor::parse_map_field(const std::vector<tparquet::SchemaElemen
|
||||
map_field->definition_level++;
|
||||
|
||||
map_field->children.resize(1);
|
||||
set_child_node_level(map_field);
|
||||
set_child_node_level(map_field, map_field->repeated_parent_def_level);
|
||||
auto map_kv_field = &map_field->children[0];
|
||||
// produce MAP<STRUCT<KEY, VALUE>>
|
||||
RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, map_kv_field));
|
||||
@ -473,7 +473,7 @@ Status FieldDescriptor::parse_struct_field(const std::vector<tparquet::SchemaEle
|
||||
}
|
||||
auto num_children = struct_schema.num_children;
|
||||
struct_field->children.resize(num_children);
|
||||
set_child_node_level(struct_field);
|
||||
set_child_node_level(struct_field, struct_field->repeated_parent_def_level);
|
||||
_next_schema_pos = curr_pos + 1;
|
||||
for (int i = 0; i < num_children; ++i) {
|
||||
RETURN_IF_ERROR(parse_node_field(t_schemas, _next_schema_pos, &struct_field->children[i]));
|
||||
|
||||
@ -40,7 +40,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& null_map,
|
||||
DCHECK_EQ(num_levels, rep_levels.size());
|
||||
size_t origin_size = null_map.size();
|
||||
null_map.resize(origin_size + num_levels);
|
||||
size_t pos = 0;
|
||||
size_t pos = origin_size;
|
||||
for (size_t i = 0; i < num_levels; ++i) {
|
||||
// skip the levels affect its ancestor or its descendants
|
||||
if (def_levels[i] < field->repeated_parent_def_level ||
|
||||
@ -53,7 +53,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& null_map,
|
||||
null_map[pos++] = 1;
|
||||
}
|
||||
}
|
||||
null_map.resize(origin_size + pos);
|
||||
null_map.resize(pos + 1);
|
||||
}
|
||||
|
||||
static void fill_array_offset(FieldSchema* field, ColumnArray::Offsets64& offsets_data,
|
||||
@ -88,9 +88,9 @@ static void fill_array_offset(FieldSchema* field, ColumnArray::Offsets64& offset
|
||||
(*null_map_ptr)[offset_pos] = 1;
|
||||
}
|
||||
}
|
||||
offsets_data.resize(origin_size + offset_pos + 1);
|
||||
offsets_data.resize(offset_pos + 1);
|
||||
if (null_map_ptr != nullptr) {
|
||||
null_map_ptr->resize(origin_size + offset_pos + 1);
|
||||
null_map_ptr->resize(offset_pos + 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -676,13 +676,18 @@ public class HiveMetaStoreClientHelper {
|
||||
*/
|
||||
private static int findNextNestedField(String commaSplitFields) {
|
||||
int numLess = 0;
|
||||
int numBracket = 0;
|
||||
for (int i = 0; i < commaSplitFields.length(); i++) {
|
||||
char c = commaSplitFields.charAt(i);
|
||||
if (c == '<') {
|
||||
numLess++;
|
||||
} else if (c == '>') {
|
||||
numLess--;
|
||||
} else if (c == ',' && numLess == 0) {
|
||||
} else if (c == '(') {
|
||||
numBracket++;
|
||||
} else if (c == ')') {
|
||||
numBracket--;
|
||||
} else if (c == ',' && numLess == 0 && numBracket == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user