Analyze schema elements in parquet FileMetaData, and generate the hierarchy of nested fields.
For example:
1. primitive type
```
// thrift:
optional int32 <column-name>;
// sql definition:
<column-name> int32;
```
2. nested type
```
// thrift:
optional group <column-name> (LIST) {
repeated group bag {
optional group array_element (LIST) {
repeated group bag {
optional int32 array_element
}
}
}
}
// sql definition:
<column-name> array<array<int32>>
```
129 lines
5.4 KiB
C++
129 lines
5.4 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include <glog/logging.h>
|
|
#include <gtest/gtest.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include <string>
|
|
|
|
#include "io/buffered_reader.h"
|
|
#include "io/file_reader.h"
|
|
#include "io/local_file_reader.h"
|
|
#include "util/runtime_profile.h"
|
|
#include "vec/exec/format/parquet/parquet_thrift_util.h"
|
|
#include "vec/exec/format/parquet/vparquet_file_metadata.h"
|
|
|
|
namespace doris {
|
|
namespace vectorized {
|
|
|
|
// Fixture shared by the parquet thrift footer / schema parsing tests in this file.
// It holds no state, so the compiler-generated constructor is sufficient.
class ParquetThriftReaderTest : public testing::Test {
public:
    ParquetThriftReaderTest() = default;
};
|
|
|
|
// Smoke test: opens a local parquet file, parses its thrift footer and logs the
// row-group and schema information read from it.
TEST_F(ParquetThriftReaderTest, normal) {
    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);

    auto st = reader.open();
    // Fatal on purpose: nothing below is meaningful if the file could not be opened.
    ASSERT_TRUE(st.ok());

    std::shared_ptr<FileMetaData> metaData;
    parse_thrift_footer(&reader, metaData);
    // metaData starts out null; stop here instead of dereferencing a null pointer
    // when the footer could not be parsed.
    ASSERT_TRUE(metaData != nullptr);
    // Bind by const reference: the metadata is only read below, so no copy is needed.
    const tparquet::FileMetaData& t_metadata = metaData->to_thrift_metadata();

    LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
    LOG(WARNING) << "num columns: " << metaData->num_columns();
    LOG(WARNING) << "=====================================";
    // Iterate by const reference to avoid copying each thrift struct just to log it.
    for (const auto& row_group : t_metadata.row_groups) {
        LOG(WARNING) << "row group num_rows: " << row_group.num_rows;
    }
    LOG(WARNING) << "=====================================";
    for (const auto& element : t_metadata.schema) {
        LOG(WARNING) << "schema column name: " << element.name;
        LOG(WARNING) << "schema column type: " << element.type;
        LOG(WARNING) << "schema column repetition_type: " << element.repetition_type;
        LOG(WARNING) << "schema column num children: " << element.num_children;
    }
}
|
|
|
|
// Parses the footer of a parquet file with nested columns and checks the field
// hierarchy built by FieldDescriptor: column order, nested types, and the
// repetition/definition levels of the leaf fields.
TEST_F(ParquetThriftReaderTest, complex_nested_file) {
    // hive-complex.parquet is the part of following table:
    // complex_nested_table(
    //   `name` string,
    //   `income` array<array<int>>,
    //   `hobby` array<map<string,string>>,
    //   `friend` map<string,string>,
    //   `mark` struct<math:int,english:int>)
    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/hive-complex.parquet", 0);

    auto st = reader.open();
    // Fatal on purpose: nothing below is meaningful if the file could not be opened.
    ASSERT_TRUE(st.ok());

    std::shared_ptr<FileMetaData> metaData;
    parse_thrift_footer(&reader, metaData);
    // metaData starts out null; stop here instead of dereferencing a null pointer
    // when the footer could not be parsed.
    ASSERT_TRUE(metaData != nullptr);
    tparquet::FileMetaData t_metadata = metaData->to_thrift_metadata();
    FieldDescriptor schemaDescriptor;
    schemaDescriptor.parse_from_thrift(t_metadata.schema);

    // table columns
    // Each condition gets its own assertion so a failure names the exact check.
    ASSERT_EQ(schemaDescriptor.get_column_index("name"), 0);
    auto name = schemaDescriptor.get_column("name");
    ASSERT_TRUE(name->children.size() == 0);
    ASSERT_TRUE(name->physical_column_index >= 0);
    ASSERT_TRUE(name->repetition_level == 0);
    ASSERT_TRUE(name->definition_level == 1);

    ASSERT_EQ(schemaDescriptor.get_column_index("income"), 1);
    auto income = schemaDescriptor.get_column("income");
    // should be parsed as ARRAY<ARRAY<INT32>>
    ASSERT_TRUE(income->type.type == TYPE_ARRAY);
    ASSERT_TRUE(income->children.size() == 1);
    ASSERT_TRUE(income->children[0].type.type == TYPE_ARRAY);
    ASSERT_TRUE(income->children[0].children.size() == 1);
    // Reference, not copy: only two fields of the leaf node are read.
    const auto& i_physical = income->children[0].children[0];
    // five levels for ARRAY<ARRAY<INT32>>
    // income --- bag --- array_element --- bag --- array_element
    //  opt       rep        opt            rep        opt
    // R=0,D=1  R=1,D=2    R=1,D=3        R=2,D=4    R=2,D=5
    ASSERT_TRUE(i_physical.repetition_level == 2);
    ASSERT_TRUE(i_physical.definition_level == 5);

    ASSERT_EQ(schemaDescriptor.get_column_index("hobby"), 2);
    auto hobby = schemaDescriptor.get_column("hobby");
    // should be parsed as ARRAY<MAP<STRUCT<STRING,STRING>>>
    // The size checks run before each deeper index, in the same order the original
    // short-circuiting expression evaluated them.
    ASSERT_TRUE(hobby->children.size() == 1);
    ASSERT_TRUE(hobby->children[0].children.size() == 1);
    ASSERT_TRUE(hobby->children[0].children[0].children.size() == 2);
    ASSERT_TRUE(hobby->type.type == TYPE_ARRAY);
    ASSERT_TRUE(hobby->children[0].type.type == TYPE_MAP);
    ASSERT_TRUE(hobby->children[0].children[0].type.type == TYPE_STRUCT);
    // hobby(opt) --- bag(rep) --- array_element(opt) --- map(rep)
    //                                                      \------- key(req)
    //                                                      \------- value(opt)
    // R=0,D=1        R=1,D=2      R=1,D=3                R=2,D=4
    //                                                      \------ R=2,D=4
    //                                                      \------ R=2,D=5
    const auto& h_key = hobby->children[0].children[0].children[0];
    const auto& h_value = hobby->children[0].children[0].children[1];
    ASSERT_TRUE(h_key.repetition_level == 2);
    ASSERT_TRUE(h_key.definition_level == 4);
    ASSERT_TRUE(h_value.repetition_level == 2);
    ASSERT_TRUE(h_value.definition_level == 5);

    ASSERT_EQ(schemaDescriptor.get_column_index("friend"), 3);
    ASSERT_EQ(schemaDescriptor.get_column_index("mark"), 4);
}
|
|
|
|
} // namespace vectorized
|
|
|
|
} // namespace doris
|