[fix](ParquetReader) definition level of repeated parent is wrong (#17337)

Fix three bugs:
1. `repeated_parent_def_level` should be the definition level of the field's repeated parent.
2. Schemas containing types like `decimal(p, s)` failed to parse.
3. Wrong offsets were filled for array types.
This commit is contained in:
Ashin Gau
2023-03-06 18:15:57 +08:00
committed by GitHub
parent 843b4b5746
commit dca16796ad
3 changed files with 21 additions and 16 deletions

View File

@ -55,11 +55,11 @@ static int num_children_node(const tparquet::SchemaElement& schema) {
return schema.__isset.num_children ? schema.num_children : 0;
}
static void set_child_node_level(FieldSchema* parent, size_t rep_inc = 0, size_t def_inc = 0) {
static void set_child_node_level(FieldSchema* parent, int16_t repeated_parent_def_level) {
for (auto& child : parent->children) {
child.repetition_level = parent->repetition_level + rep_inc;
child.definition_level = parent->definition_level + def_inc;
child.repeated_parent_def_level = parent->definition_level;
child.repetition_level = parent->repetition_level;
child.definition_level = parent->definition_level;
child.repeated_parent_def_level = repeated_parent_def_level;
}
}
@ -129,7 +129,7 @@ Status FieldDescriptor::parse_node_field(const std::vector<tparquet::SchemaEleme
node_field->repetition_level++;
node_field->definition_level++;
node_field->children.resize(1);
set_child_node_level(node_field);
set_child_node_level(node_field, node_field->definition_level);
auto child = &node_field->children[0];
parse_physical_field(t_schema, false, child);
@ -315,7 +315,7 @@ Status FieldDescriptor::parse_group_field(const std::vector<tparquet::SchemaElem
group_field->repetition_level++;
group_field->definition_level++;
group_field->children.resize(1);
set_child_node_level(group_field);
set_child_node_level(group_field, group_field->definition_level);
auto struct_field = &group_field->children[0];
// the list of struct:
// repeated group <name> (LIST) {
@ -379,16 +379,16 @@ Status FieldDescriptor::parse_list_field(const std::vector<tparquet::SchemaEleme
// optional field, and the third level element is the nested structure in list
// produce nested structure like: LIST<INT>, LIST<MAP>, LIST<LIST<...>>
// skip bag/list, it's a repeated element.
set_child_node_level(list_field);
set_child_node_level(list_field, list_field->definition_level);
RETURN_IF_ERROR(parse_node_field(t_schemas, curr_pos + 2, list_child));
} else {
// required field, produce the list of struct
set_child_node_level(list_field);
set_child_node_level(list_field, list_field->definition_level);
RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, list_child));
}
} else if (num_children == 0) {
// required two level list, for compatibility reason.
set_child_node_level(list_field);
set_child_node_level(list_field, list_field->definition_level);
parse_physical_field(second_level, false, list_child);
_next_schema_pos = curr_pos + 2;
}
@ -450,7 +450,7 @@ Status FieldDescriptor::parse_map_field(const std::vector<tparquet::SchemaElemen
map_field->definition_level++;
map_field->children.resize(1);
set_child_node_level(map_field);
set_child_node_level(map_field, map_field->repeated_parent_def_level);
auto map_kv_field = &map_field->children[0];
// produce MAP<STRUCT<KEY, VALUE>>
RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos + 1, map_kv_field));
@ -473,7 +473,7 @@ Status FieldDescriptor::parse_struct_field(const std::vector<tparquet::SchemaEle
}
auto num_children = struct_schema.num_children;
struct_field->children.resize(num_children);
set_child_node_level(struct_field);
set_child_node_level(struct_field, struct_field->repeated_parent_def_level);
_next_schema_pos = curr_pos + 1;
for (int i = 0; i < num_children; ++i) {
RETURN_IF_ERROR(parse_node_field(t_schemas, _next_schema_pos, &struct_field->children[i]));

View File

@ -40,7 +40,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& null_map,
DCHECK_EQ(num_levels, rep_levels.size());
size_t origin_size = null_map.size();
null_map.resize(origin_size + num_levels);
size_t pos = 0;
size_t pos = origin_size;
for (size_t i = 0; i < num_levels; ++i) {
// skip the levels affect its ancestor or its descendants
if (def_levels[i] < field->repeated_parent_def_level ||
@ -53,7 +53,7 @@ static void fill_struct_null_map(FieldSchema* field, NullMap& null_map,
null_map[pos++] = 1;
}
}
null_map.resize(origin_size + pos);
null_map.resize(pos + 1);
}
static void fill_array_offset(FieldSchema* field, ColumnArray::Offsets64& offsets_data,
@ -88,9 +88,9 @@ static void fill_array_offset(FieldSchema* field, ColumnArray::Offsets64& offset
(*null_map_ptr)[offset_pos] = 1;
}
}
offsets_data.resize(origin_size + offset_pos + 1);
offsets_data.resize(offset_pos + 1);
if (null_map_ptr != nullptr) {
null_map_ptr->resize(origin_size + offset_pos + 1);
null_map_ptr->resize(offset_pos + 1);
}
}

View File

@ -676,13 +676,18 @@ public class HiveMetaStoreClientHelper {
*/
private static int findNextNestedField(String commaSplitFields) {
int numLess = 0;
int numBracket = 0;
for (int i = 0; i < commaSplitFields.length(); i++) {
char c = commaSplitFields.charAt(i);
if (c == '<') {
numLess++;
} else if (c == '>') {
numLess--;
} else if (c == ',' && numLess == 0) {
} else if (c == '(') {
numBracket++;
} else if (c == ')') {
numBracket--;
} else if (c == ',' && numLess == 0 && numBracket == 0) {
return i;
}
}