[feature-wip] (parquet-reader) add parquet reader impl template (#11285)

This commit is contained in:
slothever
2022-07-29 14:30:31 +08:00
committed by GitHub
parent 642499265c
commit e4bc3f6b6f
20 changed files with 777 additions and 1 deletions

View File

@ -221,7 +221,15 @@ set(VEC_FILES
exec/file_arrow_scanner.cpp
exec/file_scanner.cpp
exec/file_scan_node.cpp
exec/file_text_scanner.cpp)
exec/file_text_scanner.cpp
exec/file_hdfs_scanner.cpp
exec/format/parquet/vparquet_column_chunk_reader.cpp
exec/format/parquet/vparquet_group_reader.cpp
exec/format/parquet/vparquet_page_index.cpp
exec/format/parquet/vparquet_reader.cpp
exec/format/parquet/vparquet_file_metadata.cpp
exec/format/parquet/vparquet_page_reader.cpp
exec/format/parquet/schema_desc.cpp)
add_library(Vec STATIC
${VEC_FILES}

View File

@ -0,0 +1,20 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "file_hdfs_scanner.h"
namespace doris::vectorized {} // namespace doris::vectorized

View File

@ -0,0 +1,26 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
namespace doris::vectorized {
// Empty placeholder class with no members yet.
// NOTE(review): presumably the common base for HDFS-backed file scanners — confirm once implemented.
class HdfsFileScanner {};
// Empty placeholder derived publicly from HdfsFileScanner; adds no members yet.
class ParquetFileHdfsScanner : public HdfsFileScanner {};
} // namespace doris::vectorized

View File

@ -0,0 +1,87 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <common/status.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>

#include "common/logging.h"
#include "gen_cpp/parquet_types.h"
#include "io/file_reader.h"
#include "util/coding.h"
#include "util/thrift_util.h"
#include "vparquet_file_metadata.h"
namespace doris::vectorized {
// Magic bytes "PAR1"; parse_thrift_footer() compares them against the last 4 bytes of the file.
constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
// Maximum number of trailing file bytes fetched by the footer read (64 KiB).
constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024;
// Size in bytes of the fixed trailer: a 4-byte little-endian metadata length followed by the
// 4-byte magic.
constexpr uint32_t PARQUET_FOOTER_SIZE = 8;
// Reads the Parquet footer from the tail of `file`, deserializes the thrift
// FileMetaData stored there and publishes the wrapped result via `file_metadata`.
//
// Expected layout at the end of the file:
//   [thrift FileMetaData][4-byte little-endian metadata length]["PAR1"]
//
// @param file           reader positioned on an already opened file
// @param file_metadata  out-parameter; replaced with the parsed metadata once the
//                       thrift message has been deserialized
// @return Status::Corruption when the trailer is missing or inconsistent, or when a
//         read returns fewer bytes than requested; otherwise any error reported by
//         the reader, the thrift deserializer or FileMetaData::init_schema().
//
// Marked inline because the definition lives in a header that is included from
// more than one translation unit.
inline Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) {
    const int64_t file_size = file->size();
    // Without at least the fixed trailer the pointer arithmetic below would step
    // in front of the buffer.
    if (file_size < static_cast<int64_t>(PARQUET_FOOTER_SIZE)) {
        return Status::Corruption("Parquet file is too small to contain a footer");
    }

    // try with buffer on stack
    uint8_t buff[PARQUET_FOOTER_READ_SIZE];
    // read footer bytes
    const int64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
    int64_t bytes_read = 0;
    RETURN_IF_ERROR(
            file->readat(file_size - footer_read_size, footer_read_size, &bytes_read, buff));
    if (bytes_read != footer_read_size) {
        return Status::Corruption("Failed to read the complete parquet footer");
    }

    // validate magic
    uint8_t* magic_ptr = buff + footer_read_size - sizeof(PARQUET_VERSION_NUMBER);
    if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
        return Status::Corruption("Invalid magic number in parquet file");
    }

    // get metadata_size
    uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE;
    uint32_t metadata_size = decode_fixed32_le(footer_buff);
    if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
        return Status::Corruption("Parquet file size is ", file_size,
                                  " bytes, smaller than the size reported by footer's (",
                                  metadata_size, "bytes)");
    }

    // The serialized metadata may be larger than what the first read fetched. In that
    // case read exactly the metadata bytes into a heap buffer instead of reading
    // memory in front of `buff`.
    uint8_t* metadata_buff = nullptr;
    std::unique_ptr<uint8_t[]> large_buff;
    const int64_t buffered_metadata_bytes = footer_read_size - PARQUET_FOOTER_SIZE;
    if (metadata_size <= buffered_metadata_bytes) {
        metadata_buff = footer_buff - metadata_size;
    } else {
        large_buff.reset(new uint8_t[metadata_size]);
        RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - metadata_size,
                                     metadata_size, &bytes_read, large_buff.get()));
        if (bytes_read != static_cast<int64_t>(metadata_size)) {
            return Status::Corruption("Failed to read the complete parquet metadata");
        }
        metadata_buff = large_buff.get();
    }

    // deserialize footer
    tparquet::FileMetaData t_metadata;
    RETURN_IF_ERROR(deserialize_thrift_msg(metadata_buff, &metadata_size, true, &t_metadata));
    file_metadata = std::make_shared<FileMetaData>(t_metadata);
    RETURN_IF_ERROR(file_metadata->init_schema());
    return Status::OK();
}
// Status parse_page_header() {
// uint8_t* page_buf;
//
// }
// Status parse_page_index() {
//
// }
// void deserialize_column_index(int64_t start_offset, tparquet::ColumnIndex) {
//
// }
//
// void deserialize_offset_index(int64_t start_offset, tparquet::OffsetIndex) {
//
// }
} // namespace doris::vectorized

View File

@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "schema_desc.h"
namespace doris::vectorized {

// Nothing to release yet; the field container is still commented out.
SchemaDescriptor::~SchemaDescriptor() {
    // fields.clear();
}

// Not implemented yet: yields an empty string.
std::string SchemaDescriptor::debug_string() const {
    return {};
}

// Not implemented yet: yields an empty string.
std::string FieldSchema::debug_string() const {
    return {};
}

} // namespace doris::vectorized

View File

@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "common/status.h"
namespace doris::vectorized {
// Per-field schema information: the maximum definition and repetition levels.
// Both levels start at 0 so the getters never return an indeterminate value
// on a default-constructed object.
class FieldSchema {
public:
    // Returns the stored maximum definition level.
    int16_t max_def_level() const { return _max_def_level; }
    // Returns the stored maximum repetition level.
    int16_t max_rep_level() const { return _max_rep_level; }
    // Human-readable description of this field (defined out of line).
    std::string debug_string() const;

private:
    int16_t _max_def_level = 0;
    int16_t _max_rep_level = 0;
    // std::vector<FieldSchema> children;
};
// Schema holder with no data members yet; the field list is still commented out.
class SchemaDescriptor {
public:
SchemaDescriptor() = default;
// Defined out of line.
~SchemaDescriptor();
// Human-readable description of the schema (defined out of line).
std::string debug_string() const;
private:
// std::vector<FieldSchema> fields;
};
} // namespace doris::vectorized

View File

@ -0,0 +1,36 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_column_chunk_reader.h"
namespace doris::vectorized {

// Placeholder: performs no work yet and reports success.
Status ColumnChunkReader::init() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status ColumnChunkReader::read_min_max_stat() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status ColumnChunkReader::decode_dict_page() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status ColumnChunkReader::decode_nested_page() {
    return Status::OK();
}

} // namespace doris::vectorized

View File

@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <common/status.h>
namespace doris::vectorized {
// Interface skeleton with no state yet. All four operations return a Status and are
// defined out of line.
class ColumnChunkReader {
public:
Status init();
Status read_min_max_stat();
Status decode_dict_page();
Status decode_nested_page();
private:
};
} // namespace doris::vectorized

View File

@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_file_metadata.h"
#include <sstream>
namespace doris::vectorized {
// Wraps a deserialized thrift FileMetaData.
//
// Copies `metadata` into _metadata and caches the total row count, the number of
// row groups and, when at least one row group exists, the number of columns of
// the first row group. The former empty check on metadata.schema[0] was dropped:
// it had no effect and indexed the schema vector without verifying it is non-empty.
FileMetaData::FileMetaData(tparquet::FileMetaData& metadata) : _metadata(metadata) {
    _num_rows = metadata.num_rows;
    _num_groups = static_cast<int32_t>(metadata.row_groups.size());
    if (_num_groups != 0) {
        _num_columns = static_cast<int32_t>(metadata.row_groups[0].columns.size());
    }
}
// Placeholder: schema construction is not implemented yet, so this reports success.
Status FileMetaData::init_schema() {
    return Status::OK();
}
// Returns a const reference to the thrift metadata stored in this object.
// The reference refers to the _metadata member, so it must not outlive this object.
const tparquet::FileMetaData& FileMetaData::to_thrift_metadata() {
return _metadata;
}
// Builds a one-line summary of the metadata: thrift version, number of row groups
// and number of rows. The first entry no longer carries a leading "; " separator.
std::string FileMetaData::debug_string() const {
    std::stringstream out;
    out << "Parquet Metadata(";
    out << "version=" << _metadata.version;
    out << "; num row groups=" << _num_groups;
    out << "; num rows=" << _num_rows;
    out << ")";
    return out.str();
}
} // namespace doris::vectorized

View File

@ -0,0 +1,45 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "common/status.h"
#include "gen_cpp/parquet_types.h"
#include "schema_desc.h"
namespace doris::vectorized {
// Wrapper around a deserialized thrift tparquet::FileMetaData that caches a few
// frequently used counters next to a copy of the thrift object.
class FileMetaData {
public:
    // Copies `metadata` and caches its counters (defined out of line).
    FileMetaData(tparquet::FileMetaData& metadata);
    ~FileMetaData() = default;
    // Builds the schema; returns a Status (defined out of line).
    Status init_schema();
    // Returns the stored thrift metadata by const reference.
    const tparquet::FileMetaData& to_thrift_metadata();
    // Number of row groups in the file.
    int32_t num_row_groups() const { return _num_groups; }
    // Number of columns, taken from the first row group by the constructor.
    int32_t num_columns() const { return _num_columns; }
    // Total number of rows. Returned as int64_t to match _num_rows; the previous
    // int32_t return type truncated large row counts.
    int64_t num_rows() const { return _num_rows; }
    // Returns a copy of the schema descriptor.
    SchemaDescriptor schema() const { return _schema; }
    // One-line human-readable summary (defined out of line).
    std::string debug_string() const;

private:
    tparquet::FileMetaData _metadata;
    int32_t _num_groups = 0;
    int32_t _num_columns = 0;
    int64_t _num_rows = 0;
    SchemaDescriptor _schema;
};
} // namespace doris::vectorized

View File

@ -0,0 +1,20 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_group_reader.h"
namespace doris::vectorized {} // namespace doris::vectorized

View File

@ -0,0 +1,24 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <common/status.h>
namespace doris::vectorized {
// Empty placeholder class with no members yet.
class RowGroupReader {};
} // namespace doris::vectorized

View File

@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_page_index.h"
namespace doris::vectorized {

// Placeholder: performs no work yet and reports success.
Status PageIndex::get_row_range_for_page() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status PageIndex::collect_skipped_page_range() {
    return Status::OK();
}

} // namespace doris::vectorized

View File

@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <common/status.h>
#include <gen_cpp/parquet_types.h>
namespace doris::vectorized {
// Interface skeleton with no active data members; the thrift index members below are
// still commented out. Both operations return a Status and are defined out of line.
class PageIndex {
public:
Status get_row_range_for_page();
Status collect_skipped_page_range();
//private:
// // row range define
// tparquet::ColumnIndex _column_index;
// tparquet::OffsetIndex _offset_index;
};
} // namespace doris::vectorized

View File

@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_page_reader.h"
namespace doris::vectorized {

// Placeholder: performs no work yet and reports success.
Status PageReader::read_page_header() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status PageReader::read_page_data() {
    return Status::OK();
}

// Placeholder: performs no work yet and reports success.
Status PageReader::init() {
    return Status::OK();
}

} // namespace doris::vectorized

View File

@ -0,0 +1,34 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <common/status.h>
#include <gen_cpp/parquet_types.h>
namespace doris::vectorized {
// Interface skeleton with no active data members; the page header member below is
// still commented out. All operations return a Status and are defined out of line.
class PageReader {
public:
Status init();
Status read_page_header();
Status read_page_data();
//private:
// tparquet::PageHeader* _page_header;
};
} // namespace doris::vectorized

View File

@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vparquet_reader.h"
#include "parquet_thrift_util.h"
namespace doris::vectorized {
doris::vectorized::ParquetReader::ParquetReader(doris::FileReader* file_reader, int64_t batch_size,
int32_t num_of_columns_from_file,
int64_t range_start_offset, int64_t range_size) {
// : _batch_size(batch_size), _num_of_columns_from_file(num_of_columns_from_file) {
_file_reader = file_reader;
_total_groups = 0;
// _current_group = 0;
// _statistics = std::make_shared<Statistics>();
}
doris::vectorized::ParquetReader::~ParquetReader() {
// _batch.clear();
}
// Opens the underlying file, parses the Parquet footer into _file_metadata and
// records the number of row groups in _total_groups.
//
// tuple_desc, tuple_slot_descs, conjunct_ctxs and timezone are not used yet.
//
// @return the error from opening the file or parsing the footer, if any;
//         Status::EndOfFile when the file contains no row groups;
//         Status::OK otherwise.
Status ParquetReader::init_reader(const TupleDescriptor* tuple_desc,
                                  const std::vector<SlotDescriptor*>& tuple_slot_descs,
                                  const std::vector<ExprContext*>& conjunct_ctxs,
                                  const std::string& timezone) {
    // A failed open must stop here instead of being silently ignored.
    RETURN_IF_ERROR(_file_reader->open());
    RETURN_IF_ERROR(parse_thrift_footer(_file_reader, _file_metadata));
    // Bind by reference: a plain `auto` would deep-copy the whole thrift metadata.
    const tparquet::FileMetaData& metadata = _file_metadata->to_thrift_metadata();
    _total_groups = static_cast<int>(metadata.row_groups.size());
    if (_total_groups == 0) {
        return Status::EndOfFile("Empty Parquet File");
    }
    return Status::OK();
}
// Returns the file offset at which `row_group` starts.
//
// Uses row_group.file_offset when the writer set it. Otherwise falls back to the
// start of the first column chunk: its dictionary page offset when one is set,
// positive and located before the first data page, else its first data page offset.
//
// NOTE(review): the fallback assumes row_group.columns is non-empty — confirm that
// callers only pass row groups with at least one column.
int64_t ParquetReader::_get_row_group_start_offset(const tparquet::RowGroup& row_group) {
    if (row_group.__isset.file_offset) {
        return row_group.file_offset;
    }
    const tparquet::ColumnMetaData& first_column = row_group.columns[0].meta_data;
    // A dictionary page, when present, is written before the data pages of the chunk.
    if (first_column.__isset.dictionary_page_offset && first_column.dictionary_page_offset > 0 &&
        first_column.dictionary_page_offset < first_column.data_page_offset) {
        return first_column.dictionary_page_offset;
    }
    return first_column.data_page_offset;
}
} // namespace doris::vectorized

View File

@ -0,0 +1,87 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <parquet/api/reader.h>
#include <parquet/api/writer.h>
#include <parquet/exception.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "common/status.h"
#include "exprs/expr_context.h"
#include "gen_cpp/PaloBrokerService_types.h"
#include "gen_cpp/PlanNodes_types.h"
#include "gen_cpp/Types_types.h"
#include "gen_cpp/parquet_types.h"
#include "io/file_reader.h"
#include "vec/core/block.h"
#include "vparquet_file_metadata.h"
namespace doris::vectorized {
// struct Statistics {
// int32_t filtered_row_groups = 0;
// int32_t total_groups = 0;
// int64_t filtered_rows = 0;
// int64_t total_rows = 0;
// int64_t filtered_total_bytes = 0;
// int64_t total_bytes = 0;
// };
// Abstract Parquet reader driven by a caller-provided FileReader.
//
// The class declares pure virtual functions, so it is used through derived types.
// The destructor is virtual so that deleting a derived reader through a
// ParquetReader pointer is well defined.
class ParquetReader {
public:
    // Stores `file_reader`; the pointer is kept as-is and dereferenced later.
    // NOTE(review): ownership of file_reader is not established here — confirm with callers.
    ParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
                  int64_t range_start_offset, int64_t range_size);

    virtual ~ParquetReader();

    // Opens the file and parses the footer; see the out-of-line definition.
    virtual Status init_reader(const TupleDescriptor* tuple_desc,
                               const std::vector<SlotDescriptor*>& tuple_slot_descs,
                               const std::vector<ExprContext*>& conjunct_ctxs,
                               const std::string& timezone) = 0;

    virtual Status next_batch(bool* eof) = 0;

    // std::shared_ptr<Statistics>& statistics() { return _statistics; }

    // Currently a no-op.
    void close() {}

    // Returns the size reported by the underlying file reader. The `size`
    // argument is currently ignored.
    int64_t size(int64_t* size) { return _file_reader->size(); }

private:
    int64_t _get_row_group_start_offset(const tparquet::RowGroup& row_group);

    FileReader* _file_reader;
    std::shared_ptr<FileMetaData> _file_metadata;
    // const int64_t _batch_size;
    // const int32_t _num_of_columns_from_file;
    int _total_groups; // num of groups(stripes) of a parquet(orc) file
    // int _current_group; // current group(stripe)
    // std::map<std::string, int> _map_column; // column-name <---> column-index
    // std::vector<int> _include_column_ids; // columns that need to get from file
    // std::shared_ptr<Statistics> _statistics;
    // parquet file reader object
    // std::vector<Block*> _batch;
    // std::string _timezone;
    // int64_t _range_start_offset;
    // int64_t _range_size;

    std::atomic<bool> _closed = false;
};
} // namespace doris::vectorized

View File

@ -59,6 +59,7 @@ set(EXEC_TEST_FILES
exec/s3_reader_test.cpp
exec/multi_bytes_separator_test.cpp
exec/hdfs_file_reader_test.cpp
vec/exec/parquet/parquet_thrift_test.cpp
# exec/new_olap_scan_node_test.cpp
# exec/pre_aggregation_node_test.cpp
# exec/partitioned_hash_table_test.cpp

View File

@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "io/buffered_reader.h"
#include "io/file_reader.h"
#include "io/local_file_reader.h"
#include "util/runtime_profile.h"
#include "vec/exec/format/parquet/parquet_thrift_util.h"
#include "vec/exec/format/parquet/vparquet_file_metadata.h"
namespace doris {
namespace vectorized {
// Test fixture for the Parquet thrift footer parsing tests; it holds no state.
class ParquetThriftReaderTest : public testing::Test {
public:
ParquetThriftReaderTest() {}
};
// Parses the footer of a local Parquet file and logs its row groups and schema.
// Opening the file and parsing the footer are hard requirements: the test aborts
// on failure instead of dereferencing a null metadata pointer.
TEST_F(ParquetThriftReaderTest, normal) {
    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);
    auto st = reader.open();
    ASSERT_TRUE(st.ok());

    std::shared_ptr<FileMetaData> metaData;
    st = parse_thrift_footer(&reader, metaData);
    ASSERT_TRUE(st.ok());
    ASSERT_TRUE(metaData != nullptr);

    // Bind by reference to avoid copying the whole thrift metadata.
    const tparquet::FileMetaData& t_metadata = metaData->to_thrift_metadata();
    LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
    LOG(WARNING) << "num columns: " << metaData->num_columns();
    LOG(WARNING) << "=====================================";
    for (const auto& value : t_metadata.row_groups) {
        LOG(WARNING) << "row group num_rows: " << value.num_rows;
    }
    LOG(WARNING) << "=====================================";
    for (const auto& value : t_metadata.schema) {
        LOG(WARNING) << "schema column name: " << value.name;
        LOG(WARNING) << "schema column type: " << value.type;
        LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
        LOG(WARNING) << "schema column num children: " << value.num_children;
    }
}
} // namespace vectorized
} // namespace doris